mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-07 19:05:06 +02:00
✨ ALTO: Count Layout/Page/* elements
This commit is contained in:
parent
1c62085612
commit
10b8023dd6
2 changed files with 40 additions and 1 deletions
|
@ -198,7 +198,7 @@ class TagGroup:
|
||||||
counts = {}
|
counts = {}
|
||||||
for e in self.group:
|
for e in self.group:
|
||||||
for x in e.iter():
|
for x in e.iter():
|
||||||
tag = ET.QName(x).localname
|
tag = ET.QName(x.tag).localname
|
||||||
key = f"{tag}-count"
|
key = f"{tag}-count"
|
||||||
counts[key] = counts.get(key, 0) + 1
|
counts[key] = counts.get(key, 0) + 1
|
||||||
return counts
|
return counts
|
||||||
|
|
39
qurator/modstool/tests/test_alto.py
Normal file
39
qurator/modstool/tests/test_alto.py
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
|
||||||
|
from qurator.modstool.altotool import alto_to_dict
|
||||||
|
from qurator.modstool.lib import flatten
|
||||||
|
|
||||||
|
|
||||||
|
def dict_fromstring(x):
    """
    Parse the XML string x and return flatten(alto_to_dict(...)) of its root

    Test helper: lets a test case write its ALTO input inline as a string
    and look up keys in the flattened result directly.
    """
    root = ET.fromstring(x)
    nested = alto_to_dict(root)
    return flatten(nested)
|
||||||
|
|
||||||
|
def test_Page_counts():
    """
    Each element type nested under Layout/Page should get a "-count" entry
    """
    # One TextBlock holding three TextLines with 1 + 2 + 3 = 6 Strings.
    alto_xml = """
    <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
      <Layout>
        <Page ID="Page1" PHYSICAL_IMG_NR="1">
          <TextBlock ID="Page1_Block1">
            <TextLine>
              <String STYLE="bold" WC="0.8937500119" CONTENT="Staatsbibliothek" />
            </TextLine>
            <TextLine>
              <String STYLE="bold" WC="0.8899999857" CONTENT="zu" />
              <String STYLE="bold" WC="0.9866666794" CONTENT="Berlin" />
            </TextLine>
            <TextLine>
              <String STYLE="bold" WC="1." CONTENT="WM" />
              <String STYLE="bold" WC="0.8927272558" CONTENT="Preußischer" />
              <String STYLE="bold" WC="0.9058333039" CONTENT="Kulturbesitz" />
            </TextLine>
          </TextBlock>
        </Page>
      </Layout>
    </alto>
    """
    d = dict_fromstring(alto_xml)

    # Checked in the same order as the nesting: block, line, string.
    expected_counts = (
        ('Layout_Page_TextBlock-count', 1),
        ('Layout_Page_TextLine-count', 3),
        ('Layout_Page_String-count', 6),
    )
    for key, expected in expected_counts:
        assert d[key] == expected
|
Loading…
Add table
Add a link
Reference in a new issue