ALTO: Count Layout/Page/* elements

master
Gerber, Mike 3 years ago
parent 1c62085612
commit 10b8023dd6

@ -198,7 +198,7 @@ class TagGroup:
counts = {} counts = {}
for e in self.group: for e in self.group:
for x in e.iter(): for x in e.iter():
tag = ET.QName(x).localname tag = ET.QName(x.tag).localname
key = f"{tag}-count" key = f"{tag}-count"
counts[key] = counts.get(key, 0) + 1 counts[key] = counts.get(key, 0) + 1
return counts return counts

@ -0,0 +1,39 @@
import xml.etree.ElementTree as ET
from qurator.modstool.altotool import alto_to_dict
from qurator.modstool.lib import flatten
def dict_fromstring(x):
    """Parse the XML string *x* and return the flattened ALTO dict.

    The string is parsed with ``ET.fromstring``; the resulting root element
    is passed to ``alto_to_dict`` and that result is passed to ``flatten``.
    """
    root = ET.fromstring(x)
    alto_dict = alto_to_dict(root)
    return flatten(alto_dict)
def test_Page_counts():
    """
    Elements nested under Layout/Page must each get a "<tag>-count" entry.
    """
    # Fixture: one TextBlock containing three TextLines that hold
    # 1 + 2 + 3 = 6 String elements in total.
    alto_xml = """
<alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
<Layout>
<Page ID="Page1" PHYSICAL_IMG_NR="1">
<TextBlock ID="Page1_Block1">
<TextLine>
<String STYLE="bold" WC="0.8937500119" CONTENT="Staatsbibliothek" />
</TextLine>
<TextLine>
<String STYLE="bold" WC="0.8899999857" CONTENT="zu" />
<String STYLE="bold" WC="0.9866666794" CONTENT="Berlin" />
</TextLine>
<TextLine>
<String STYLE="bold" WC="1." CONTENT="WM" />
<String STYLE="bold" WC="0.8927272558" CONTENT="Preußischer" />
<String STYLE="bold" WC="0.9058333039" CONTENT="Kulturbesitz" />
</TextLine>
</TextBlock>
</Page>
</Layout>
</alto>
"""
    flat = dict_fromstring(alto_xml)

    # Checked in this order so the first failing key is the outermost element.
    expected_counts = {
        'Layout_Page_TextBlock-count': 1,
        'Layout_Page_TextLine-count': 3,
        'Layout_Page_String-count': 6,
    }
    for key, expected in expected_counts.items():
        assert flat[key] == expected
Loading…
Cancel
Save