From c85356bd23bd3eded686210e1e11adb030af7404 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 10 May 2022 17:46:50 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20ALTO:=20Support=20more=20ALTO=20ver?= =?UTF-8?q?sions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 76 ++++++++++++++++++------------------ qurator/modstool/lib.py | 9 ++++- 2 files changed, 45 insertions(+), 40 deletions(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 431aa1e..8fd3635 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -31,50 +31,50 @@ def alto_to_dict(alto, raise_errors=True): for tag, group in sorted_groupby(alto, key=attrgetter('tag')): group = list(group) - # XXX Namespaces seem to use a trailing / sometimes, sometimes not. - # (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS}) - if tag == '{http://www.loc.gov/standards/alto/ns-v2#}Description': - value['Description'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}MeasurementUnit': - value['MeasurementUnit'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}OCRProcessing': - value['OCRProcessing'] = TagGroup(tag, group).is_singleton().descend(raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}ocrProcessingStep': + localname = ET.QName(tag).localname + + if localname == 'Description': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) + elif localname == 'MeasurementUnit': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'OCRProcessing': + value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors) + elif localname == 'ocrProcessingStep': for n, e in enumerate(group): - value['ocrProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}preProcessingStep': + value[f'{localname}{n}'] = alto_to_dict(e, raise_errors) + elif localname == 'preProcessingStep': # TODO This enumerated descent is used more than once, DRY! for n, e in enumerate(group): - value['preProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingDateTime': - value['processingDateTime'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingSoftware': - value['processingSoftware'] = TagGroup(tag, group).is_singleton().descend(raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingAgency': - value['processingAgency'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepDescription': - value['processingStepDescription'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepSettings': - value['processingStepSettings'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareCreator': - value['softwareCreator'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareName': - value['softwareName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareVersion': - value['softwareVersion'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}sourceImageInformation': - value['sourceImageInformation'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}fileName': - value['fileName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() - - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Layout': - value['Layout'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Page': + value[f'{localname}{n}'] = alto_to_dict(e, raise_errors) + elif localname == 'processingDateTime': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'processingSoftware': + value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors) + elif localname == 'processingAgency': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'processingStepDescription': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'processingStepSettings': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'softwareCreator': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'softwareName': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + elif localname == 'softwareVersion': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + + elif localname == 'sourceImageInformation': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) + elif localname == 'fileName': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() + + elif localname == 'Layout': + value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) + elif localname == 'Page': value['Page'] = {} value['Page'].update(TagGroup(tag, group).is_singleton().attributes()) value['Page'].update(TagGroup(tag, group).subelement_counts()) - elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Styles': + elif localname == 'Styles': pass else: if raise_errors: diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index 5ebf0ac..4d00510 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -191,7 +191,9 @@ class TagGroup: """ attrib = {} for e in self.group: - attrib.update(e.attrib) + for a, v in e.attrib.items(): + a_localname = ET.QName(a).localname + attrib[a_localname] = v return attrib def subelement_counts(self): @@ -224,7 +226,10 @@ def _to_dict(root, raise_errors): return mods_to_dict(root, raise_errors) elif root_name.namespace == "http://www.loc.gov/METS/": return mets_to_dict(root, raise_errors) - elif root_name.namespace == "http://www.loc.gov/standards/alto/ns-v2#": + elif root_name.namespace in [ + "http://www.loc.gov/standards/alto/ns-v2#", + "http://www.loc.gov/standards/alto/", + ]: return alto_to_dict(root, raise_errors) else: raise ValueError(f"Unknown namespace {root_name.namespace}")