mirror of
				https://github.com/qurator-spk/modstool.git
				synced 2025-11-04 03:14:14 +01:00 
			
		
		
		
	✨ Count all alto:String elements with TAGREFS attribute
This commit is contained in:
		
							parent
							
								
									a40716a320
								
							
						
					
					
						commit
						c803ce0907
					
				
					 3 changed files with 38 additions and 0 deletions
				
			
		| 
						 | 
					@ -83,6 +83,9 @@ def alto_to_dict(alto, raise_errors=True):
 | 
				
			||||||
            value[localname].update(TagGroup(tag, group).subelement_counts())
 | 
					            value[localname].update(TagGroup(tag, group).subelement_counts())
 | 
				
			||||||
            value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
 | 
					            value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # Count all alto:String elements with TAGREFS attribute
 | 
				
			||||||
 | 
					            value[localname].update(TagGroup(tag, group).xpath_count("//alto:String[@TAGREFS]", namespaces))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        elif localname == 'Styles':
 | 
					        elif localname == 'Styles':
 | 
				
			||||||
            pass
 | 
					            pass
 | 
				
			||||||
        elif localname == 'Tags':
 | 
					        elif localname == 'Tags':
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -229,6 +229,17 @@ class TagGroup:
 | 
				
			||||||
            statistics[f'{xpath_expr}-max'] = np.max(values)
 | 
					            statistics[f'{xpath_expr}-max'] = np.max(values)
 | 
				
			||||||
        return statistics
 | 
					        return statistics
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def xpath_count(self, xpath_expr, namespaces):
					        """
					        Count the results of evaluating xpath_expr over the whole group.

					        The expression is evaluated once on every element of self.group,
					        passing the given namespaces mapping through to the element's
					        xpath() call; the sizes of the individual results are added up.

					        Returns a dict with a single entry whose key is the expression
					        followed by "-count" and whose value is the total.
					        """
					        total = sum(
					            len(element.xpath(xpath_expr, namespaces=namespaces))
					            for element in self.group
					        )
					        return {f'{xpath_expr}-count': total}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -55,3 +55,27 @@ def test_Tags_counts():
 | 
				
			||||||
    </alto>
 | 
					    </alto>
 | 
				
			||||||
    """)
 | 
					    """)
 | 
				
			||||||
    assert d['Tags_NamedEntityTag-count'] == 9
 | 
					    assert d['Tags_NamedEntityTag-count'] == 9
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_String_TAGREF_counts():
					    """
					    Four String elements are present, three of which carry a TAGREFS
					    attribute; both numbers must show up in the resulting dict.
					    """
					    alto_xml = """
					    <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
					      <Layout>
					      <Page>
					      <PrintSpace>
					      <TextBlock>
					        <TextLine>
					          <String CONTENT="Pentlings" HEIGHT="33" HPOS="330" TAGREFS="PER0" VPOS="699" WC="0.4511111081" WIDTH="146"/>
					        </TextLine>
					        <TextLine>
					          <String CONTENT="Pentlings" HEIGHT="33" HPOS="330" TAGREFS="PER0" VPOS="699" WC="0.4511111081" WIDTH="146"/>
					          <String CONTENT="Pentlings" HEIGHT="33" HPOS="330" TAGREFS="PER0" VPOS="699" WC="0.4511111081" WIDTH="146"/>
					          <String CONTENT="No TAGREF!" />
					        </TextLine>
					      </TextBlock>
					      </PrintSpace>
					      </Page>
					      </Layout>
					    </alto>
					    """
					    counts = dict_fromstring(alto_xml)

					    # Only the String elements that have a TAGREFS attribute
					    assert counts['Layout_Page_//alto:String[@TAGREFS]-count'] == 3
					    # Every String element, with or without TAGREFS
					    assert counts['Layout_Page_String-count'] == 4
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue