ALTO: Count alto:Tags

master
Gerber, Mike 2 years ago
parent de50f13043
commit a40716a320

@@ -86,7 +86,8 @@ def alto_to_dict(alto, raise_errors=True):
elif localname == 'Styles':
pass
elif localname == 'Tags':
pass
value[localname] = {}
value[localname].update(TagGroup(tag, group).subelement_counts())
else:
if raise_errors:
print(value)

@@ -37,3 +37,21 @@ def test_Page_counts():
assert d['Layout_Page_TextBlock-count'] == 1
assert d['Layout_Page_TextLine-count'] == 3
assert d['Layout_Page_String-count'] == 6
def test_Tags_counts():
    """Check that NamedEntityTag children of <Tags> are counted.

    The fixture holds nine NamedEntityTag elements inside a single Tags
    element; the resulting dict is expected to report that number under
    the 'Tags_NamedEntityTag-count' key.
    """
    # The markup is kept verbatim (no leading indentation) so the string
    # handed to dict_fromstring is exactly the one shown in the fixture.
    # NOTE(review): dict_fromstring is defined elsewhere; presumably it
    # parses the ALTO XML string into a flat dict -- confirm.
    alto_xml = """
<alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
<Tags>
<NamedEntityTag ID="PER0" LABEL="Pentlings"/>
<NamedEntityTag ID="LOC1" LABEL="Pentling"/>
<NamedEntityTag ID="LOC2" LABEL="Hamm"/>
<NamedEntityTag ID="PER4" LABEL="Hofes Pentling"/>
<NamedEntityTag ID="LOC5" LABEL="Hofs Pentling"/>
<NamedEntityTag ID="LOC7" LABEL="Hilbeck"/>
<NamedEntityTag ID="PER8" LABEL="Hoff"/>
<NamedEntityTag ID="PER9" LABEL="L i b e r"/>
<NamedEntityTag ID="PER10" LABEL="Jhesu Christi"/>
</Tags>
</alto>
"""
    counts = dict_fromstring(alto_xml)
    assert counts['Tags_NamedEntityTag-count'] == 9

Loading…
Cancel
Save