ALTO: Count Layout/Page/* elements

master
Gerber, Mike 3 years ago
parent c9737683b1
commit 1c62085612

@ -55,8 +55,9 @@ def alto_to_dict(alto, raise_errors=True):
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Layout':
value['Layout'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Page':
value['Page'] = TagGroup(tag, group).is_singleton().attributes()
# TODO subelements
value['Page'] = {}
value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
value['Page'].update(TagGroup(tag, group).subelement_counts())
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Styles':
pass
else:

@ -194,6 +194,16 @@ class TagGroup:
attrib.update(e.attrib)
return attrib
def subelement_counts(self):
    """
    Count the elements reachable from this group, keyed by local tag name.

    Every member of ``self.group`` is walked with ``iter()``, so the member
    itself and all of its descendants contribute to the counts.

    Returns a dict mapping ``"<localname>-count"`` to the number of elements
    seen with that local name. The dict is empty if the group is empty.
    """
    counts = {}
    for e in self.group:
        for x in e.iter():
            # NOTE(review): assumes ET is lxml.etree. There, an unfiltered
            # iter() also yields comments and processing instructions, whose
            # .tag is not a string and is rejected by ET.QName(). Skip those
            # nodes so a stray XML comment does not abort the whole count.
            if not isinstance(x.tag, str):
                continue
            tag = ET.QName(x).localname
            key = f"{tag}-count"
            counts[key] = counts.get(key, 0) + 1
    return counts
def sorted_groupby(iterable, key=None):
"""

Loading…
Cancel
Save