mirror of
				https://github.com/qurator-spk/modstool.git
				synced 2025-11-04 11:24:14 +01:00 
			
		
		
		
	✨ ALTO: Count Layout/Page/* elements
This commit is contained in:
		
							parent
							
								
									1c62085612
								
							
						
					
					
						commit
						10b8023dd6
					
				
					 2 changed files with 40 additions and 1 deletion
				
			
		| 
						 | 
					@ -198,7 +198,7 @@ class TagGroup:
 | 
				
			||||||
        counts = {}
 | 
					        counts = {}
 | 
				
			||||||
        for e in self.group:
 | 
					        for e in self.group:
 | 
				
			||||||
            for x in e.iter():
 | 
					            for x in e.iter():
 | 
				
			||||||
                tag = ET.QName(x).localname
 | 
					                tag = ET.QName(x.tag).localname
 | 
				
			||||||
                key = f"{tag}-count"
 | 
					                key = f"{tag}-count"
 | 
				
			||||||
                counts[key] = counts.get(key, 0) + 1
 | 
					                counts[key] = counts.get(key, 0) + 1
 | 
				
			||||||
        return counts
 | 
					        return counts
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										39
									
								
								qurator/modstool/tests/test_alto.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										39
									
								
								qurator/modstool/tests/test_alto.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,39 @@
 | 
				
			||||||
 | 
					import xml.etree.ElementTree as ET
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from qurator.modstool.altotool import alto_to_dict
 | 
				
			||||||
 | 
					from qurator.modstool.lib import flatten
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def dict_fromstring(x):
 | 
				
			||||||
 | 
					   return flatten(alto_to_dict(ET.fromstring(x)))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_Page_counts():
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    Elements below Layout/Page should be counted
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    d = dict_fromstring("""
 | 
				
			||||||
 | 
					    <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
 | 
				
			||||||
 | 
					      <Layout>
 | 
				
			||||||
 | 
					        <Page ID="Page1" PHYSICAL_IMG_NR="1">
 | 
				
			||||||
 | 
					            <TextBlock ID="Page1_Block1">
 | 
				
			||||||
 | 
					              <TextLine>
 | 
				
			||||||
 | 
					                <String STYLE="bold" WC="0.8937500119" CONTENT="Staatsbibliothek" />
 | 
				
			||||||
 | 
					              </TextLine>
 | 
				
			||||||
 | 
					              <TextLine>
 | 
				
			||||||
 | 
					                <String STYLE="bold" WC="0.8899999857" CONTENT="zu" />
 | 
				
			||||||
 | 
					                <String STYLE="bold" WC="0.9866666794" CONTENT="Berlin" />
 | 
				
			||||||
 | 
					              </TextLine>
 | 
				
			||||||
 | 
					              <TextLine>
 | 
				
			||||||
 | 
					                <String STYLE="bold" WC="1." CONTENT="WM" />
 | 
				
			||||||
 | 
					                <String STYLE="bold" WC="0.8927272558" CONTENT="Preußischer" />
 | 
				
			||||||
 | 
					                <String STYLE="bold" WC="0.9058333039" CONTENT="Kulturbesitz" />
 | 
				
			||||||
 | 
					              </TextLine>
 | 
				
			||||||
 | 
					            </TextBlock>
 | 
				
			||||||
 | 
					        </Page>
 | 
				
			||||||
 | 
					      </Layout>
 | 
				
			||||||
 | 
					    </alto>
 | 
				
			||||||
 | 
					    """)
 | 
				
			||||||
 | 
					    assert d['Layout_Page_TextBlock-count'] == 1
 | 
				
			||||||
 | 
					    assert d['Layout_Page_TextLine-count'] == 3
 | 
				
			||||||
 | 
					    assert d['Layout_Page_String-count'] == 6
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue