mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-08 11:20:07 +02:00
✨ ALTO: Count alto:Tags
This commit is contained in:
parent
de50f13043
commit
a40716a320
2 changed files with 20 additions and 1 deletion
|
@@ -86,7 +86,8 @@ def alto_to_dict(alto, raise_errors=True):
|
||||||
elif localname == 'Styles':
|
elif localname == 'Styles':
|
||||||
pass
|
pass
|
||||||
elif localname == 'Tags':
|
elif localname == 'Tags':
|
||||||
pass
|
value[localname] = {}
|
||||||
|
value[localname].update(TagGroup(tag, group).subelement_counts())
|
||||||
else:
|
else:
|
||||||
if raise_errors:
|
if raise_errors:
|
||||||
print(value)
|
print(value)
|
||||||
|
|
|
@@ -37,3 +37,21 @@ def test_Page_counts():
|
||||||
assert d['Layout_Page_TextBlock-count'] == 1
|
assert d['Layout_Page_TextBlock-count'] == 1
|
||||||
assert d['Layout_Page_TextLine-count'] == 3
|
assert d['Layout_Page_TextLine-count'] == 3
|
||||||
assert d['Layout_Page_String-count'] == 6
|
assert d['Layout_Page_String-count'] == 6
|
||||||
|
|
||||||
|
def test_Tags_counts():
    """Check that the children of an ALTO ``<Tags>`` element are counted.

    The fixture declares nine ``NamedEntityTag`` elements directly below
    ``<Tags>``; the dict returned by ``dict_fromstring`` must therefore
    report 9 under the ``Tags_NamedEntityTag-count`` key.
    """
    # NOTE(review): indentation inside the XML literal was reconstructed;
    # the element order and attributes are unchanged.
    d = dict_fromstring("""
    <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
      <Tags>
        <NamedEntityTag ID="PER0" LABEL="Pentlings"/>
        <NamedEntityTag ID="LOC1" LABEL="Pentling"/>
        <NamedEntityTag ID="LOC2" LABEL="Hamm"/>
        <NamedEntityTag ID="PER4" LABEL="Hofes Pentling"/>
        <NamedEntityTag ID="LOC5" LABEL="Hofs Pentling"/>
        <NamedEntityTag ID="LOC7" LABEL="Hilbeck"/>
        <NamedEntityTag ID="PER8" LABEL="Hoff"/>
        <NamedEntityTag ID="PER9" LABEL="L i b e r"/>
        <NamedEntityTag ID="PER10" LABEL="Jhesu Christi"/>
      </Tags>
    </alto>
    """)
    # One count entry per child element name, keyed "<parent>_<child>-count".
    assert d['Tags_NamedEntityTag-count'] == 9
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue