ALTO: Support more ALTO versions

master
Gerber, Mike 3 years ago
parent c91c9b1714
commit c85356bd23

@ -31,50 +31,50 @@ def alto_to_dict(alto, raise_errors=True):
for tag, group in sorted_groupby(alto, key=attrgetter('tag')):
group = list(group)
# XXX Namespace URIs are inconsistent about the trailing slash: some include it, some do not
# (e.g. {http://www.loc.gov/METS/} vs. {http://www.loc.gov/METS}).
if tag == '{http://www.loc.gov/standards/alto/ns-v2#}Description':
value['Description'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}MeasurementUnit':
value['MeasurementUnit'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}OCRProcessing':
value['OCRProcessing'] = TagGroup(tag, group).is_singleton().descend(raise_errors)
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}ocrProcessingStep':
localname = ET.QName(tag).localname
if localname == 'Description':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
elif localname == 'MeasurementUnit':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif localname == 'OCRProcessing':
value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
elif localname == 'ocrProcessingStep':
for n, e in enumerate(group):
value['ocrProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors)
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}preProcessingStep':
value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
elif localname == 'preProcessingStep':
# TODO: This enumerated descent is duplicated for several tags; factor it into a shared helper (DRY).
for n, e in enumerate(group):
value['preProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors)
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingDateTime':
value['processingDateTime'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingSoftware':
value['processingSoftware'] = TagGroup(tag, group).is_singleton().descend(raise_errors)
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingAgency':
value['processingAgency'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepDescription':
value['processingStepDescription'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepSettings':
value['processingStepSettings'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareCreator':
value['softwareCreator'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareName':
value['softwareName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareVersion':
value['softwareVersion'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}sourceImageInformation':
value['sourceImageInformation'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}fileName':
value['fileName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Layout':
value['Layout'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Page':
value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
elif localname == 'processingDateTime':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif localname == 'processingSoftware':
value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
elif localname == 'processingAgency':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif localname == 'processingStepDescription':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif localname == 'processingStepSettings':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif localname == 'softwareCreator':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif localname == 'softwareName':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif localname == 'softwareVersion':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif localname == 'sourceImageInformation':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
elif localname == 'fileName':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
elif localname == 'Layout':
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
elif localname == 'Page':
value['Page'] = {}
value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
value['Page'].update(TagGroup(tag, group).subelement_counts())
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Styles':
elif localname == 'Styles':
pass
else:
if raise_errors:

@ -191,7 +191,9 @@ class TagGroup:
"""
attrib = {}
for e in self.group:
attrib.update(e.attrib)
for a, v in e.attrib.items():
a_localname = ET.QName(a).localname
attrib[a_localname] = v
return attrib
def subelement_counts(self):
@ -224,7 +226,10 @@ def _to_dict(root, raise_errors):
return mods_to_dict(root, raise_errors)
elif root_name.namespace == "http://www.loc.gov/METS/":
return mets_to_dict(root, raise_errors)
elif root_name.namespace == "http://www.loc.gov/standards/alto/ns-v2#":
elif root_name.namespace in [
"http://www.loc.gov/standards/alto/ns-v2#",
"http://www.loc.gov/standards/alto/",
]:
return alto_to_dict(root, raise_errors)
else:
raise ValueError(f"Unknown namespace {root_name.namespace}")

Loading…
Cancel
Save