mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-07 19:05:06 +02:00
✨ ALTO: Support more ALTO versions
This commit is contained in:
parent
c91c9b1714
commit
c85356bd23
2 changed files with 43 additions and 38 deletions
|
@ -31,50 +31,50 @@ def alto_to_dict(alto, raise_errors=True):
|
|||
for tag, group in sorted_groupby(alto, key=attrgetter('tag')):
|
||||
group = list(group)
|
||||
|
||||
# XXX Namespaces seem to use a trailing / sometimes, sometimes not.
|
||||
# (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS})
|
||||
if tag == '{http://www.loc.gov/standards/alto/ns-v2#}Description':
|
||||
value['Description'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}MeasurementUnit':
|
||||
value['MeasurementUnit'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}OCRProcessing':
|
||||
value['OCRProcessing'] = TagGroup(tag, group).is_singleton().descend(raise_errors)
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}ocrProcessingStep':
|
||||
localname = ET.QName(tag).localname
|
||||
|
||||
if localname == 'Description':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
||||
elif localname == 'MeasurementUnit':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif localname == 'OCRProcessing':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
|
||||
elif localname == 'ocrProcessingStep':
|
||||
for n, e in enumerate(group):
|
||||
value['ocrProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors)
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}preProcessingStep':
|
||||
value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
|
||||
elif localname == 'preProcessingStep':
|
||||
# TODO This enumerated descent is used more than once, DRY!
|
||||
for n, e in enumerate(group):
|
||||
value['preProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors)
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingDateTime':
|
||||
value['processingDateTime'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingSoftware':
|
||||
value['processingSoftware'] = TagGroup(tag, group).is_singleton().descend(raise_errors)
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingAgency':
|
||||
value['processingAgency'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepDescription':
|
||||
value['processingStepDescription'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepSettings':
|
||||
value['processingStepSettings'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareCreator':
|
||||
value['softwareCreator'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareName':
|
||||
value['softwareName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareVersion':
|
||||
value['softwareVersion'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
|
||||
elif localname == 'processingDateTime':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif localname == 'processingSoftware':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
|
||||
elif localname == 'processingAgency':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif localname == 'processingStepDescription':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif localname == 'processingStepSettings':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif localname == 'softwareCreator':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif localname == 'softwareName':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif localname == 'softwareVersion':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}sourceImageInformation':
|
||||
value['sourceImageInformation'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}fileName':
|
||||
value['fileName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
elif localname == 'sourceImageInformation':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
||||
elif localname == 'fileName':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Layout':
|
||||
value['Layout'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Page':
|
||||
elif localname == 'Layout':
|
||||
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
||||
elif localname == 'Page':
|
||||
value['Page'] = {}
|
||||
value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
|
||||
value['Page'].update(TagGroup(tag, group).subelement_counts())
|
||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Styles':
|
||||
elif localname == 'Styles':
|
||||
pass
|
||||
else:
|
||||
if raise_errors:
|
||||
|
|
|
@ -191,7 +191,9 @@ class TagGroup:
|
|||
"""
|
||||
attrib = {}
|
||||
for e in self.group:
|
||||
attrib.update(e.attrib)
|
||||
for a, v in e.attrib.items():
|
||||
a_localname = ET.QName(a).localname
|
||||
attrib[a_localname] = v
|
||||
return attrib
|
||||
|
||||
def subelement_counts(self):
|
||||
|
@ -224,7 +226,10 @@ def _to_dict(root, raise_errors):
|
|||
return mods_to_dict(root, raise_errors)
|
||||
elif root_name.namespace == "http://www.loc.gov/METS/":
|
||||
return mets_to_dict(root, raise_errors)
|
||||
elif root_name.namespace == "http://www.loc.gov/standards/alto/ns-v2#":
|
||||
elif root_name.namespace in [
|
||||
"http://www.loc.gov/standards/alto/ns-v2#",
|
||||
"http://www.loc.gov/standards/alto/",
|
||||
]:
|
||||
return alto_to_dict(root, raise_errors)
|
||||
else:
|
||||
raise ValueError(f"Unknown namespace {root_name.namespace}")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue