def subelement_counts(self):
    """Count the elements in this group's subtrees, keyed by local tag name.

    Every member of ``self.group`` is walked with ``iter()``, which starts
    at the member itself, so the group element is included in the counts
    alongside all of its descendants.

    Returns:
        A dict mapping ``"<localname>-count"`` to the number of elements
        with that (namespace-stripped) tag name across the whole group.
    """
    counts = {}
    for e in self.group:
        # Restrict the walk to real element nodes.  An unfiltered iter()
        # also yields comments, processing instructions and entities, whose
        # .tag is not a string, and QName() raises ValueError on those.
        # NOTE(review): assumes ET is lxml.etree (QName.localname is an
        # lxml-only attribute) -- confirm against the file's imports.
        for x in e.iter(ET.Element):
            tag = ET.QName(x).localname
            key = f"{tag}-count"
            counts[key] = counts.get(key, 0) + 1
    return counts