mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-07 19:05:06 +02:00
✨ ALTO: Support more ALTO versions
This commit is contained in:
parent
c91c9b1714
commit
c85356bd23
2 changed files with 43 additions and 38 deletions
|
@ -31,50 +31,50 @@ def alto_to_dict(alto, raise_errors=True):
|
||||||
for tag, group in sorted_groupby(alto, key=attrgetter('tag')):
|
for tag, group in sorted_groupby(alto, key=attrgetter('tag')):
|
||||||
group = list(group)
|
group = list(group)
|
||||||
|
|
||||||
# XXX Namespaces seem to use a trailing / sometimes, sometimes not.
|
localname = ET.QName(tag).localname
|
||||||
# (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS})
|
|
||||||
if tag == '{http://www.loc.gov/standards/alto/ns-v2#}Description':
|
if localname == 'Description':
|
||||||
value['Description'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}MeasurementUnit':
|
elif localname == 'MeasurementUnit':
|
||||||
value['MeasurementUnit'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}OCRProcessing':
|
elif localname == 'OCRProcessing':
|
||||||
value['OCRProcessing'] = TagGroup(tag, group).is_singleton().descend(raise_errors)
|
value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}ocrProcessingStep':
|
elif localname == 'ocrProcessingStep':
|
||||||
for n, e in enumerate(group):
|
for n, e in enumerate(group):
|
||||||
value['ocrProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors)
|
value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}preProcessingStep':
|
elif localname == 'preProcessingStep':
|
||||||
# TODO This enumerated descent is used more than once, DRY!
|
# TODO This enumerated descent is used more than once, DRY!
|
||||||
for n, e in enumerate(group):
|
for n, e in enumerate(group):
|
||||||
value['preProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors)
|
value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingDateTime':
|
elif localname == 'processingDateTime':
|
||||||
value['processingDateTime'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingSoftware':
|
elif localname == 'processingSoftware':
|
||||||
value['processingSoftware'] = TagGroup(tag, group).is_singleton().descend(raise_errors)
|
value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingAgency':
|
elif localname == 'processingAgency':
|
||||||
value['processingAgency'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepDescription':
|
elif localname == 'processingStepDescription':
|
||||||
value['processingStepDescription'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepSettings':
|
elif localname == 'processingStepSettings':
|
||||||
value['processingStepSettings'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareCreator':
|
elif localname == 'softwareCreator':
|
||||||
value['softwareCreator'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareName':
|
elif localname == 'softwareName':
|
||||||
value['softwareName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareVersion':
|
elif localname == 'softwareVersion':
|
||||||
value['softwareVersion'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||||
|
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}sourceImageInformation':
|
elif localname == 'sourceImageInformation':
|
||||||
value['sourceImageInformation'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}fileName':
|
elif localname == 'fileName':
|
||||||
value['fileName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||||
|
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Layout':
|
elif localname == 'Layout':
|
||||||
value['Layout'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Page':
|
elif localname == 'Page':
|
||||||
value['Page'] = {}
|
value['Page'] = {}
|
||||||
value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
|
value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
|
||||||
value['Page'].update(TagGroup(tag, group).subelement_counts())
|
value['Page'].update(TagGroup(tag, group).subelement_counts())
|
||||||
elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Styles':
|
elif localname == 'Styles':
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
if raise_errors:
|
if raise_errors:
|
||||||
|
|
|
@ -191,7 +191,9 @@ class TagGroup:
|
||||||
"""
|
"""
|
||||||
attrib = {}
|
attrib = {}
|
||||||
for e in self.group:
|
for e in self.group:
|
||||||
attrib.update(e.attrib)
|
for a, v in e.attrib.items():
|
||||||
|
a_localname = ET.QName(a).localname
|
||||||
|
attrib[a_localname] = v
|
||||||
return attrib
|
return attrib
|
||||||
|
|
||||||
def subelement_counts(self):
|
def subelement_counts(self):
|
||||||
|
@ -224,7 +226,10 @@ def _to_dict(root, raise_errors):
|
||||||
return mods_to_dict(root, raise_errors)
|
return mods_to_dict(root, raise_errors)
|
||||||
elif root_name.namespace == "http://www.loc.gov/METS/":
|
elif root_name.namespace == "http://www.loc.gov/METS/":
|
||||||
return mets_to_dict(root, raise_errors)
|
return mets_to_dict(root, raise_errors)
|
||||||
elif root_name.namespace == "http://www.loc.gov/standards/alto/ns-v2#":
|
elif root_name.namespace in [
|
||||||
|
"http://www.loc.gov/standards/alto/ns-v2#",
|
||||||
|
"http://www.loc.gov/standards/alto/",
|
||||||
|
]:
|
||||||
return alto_to_dict(root, raise_errors)
|
return alto_to_dict(root, raise_errors)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown namespace {root_name.namespace}")
|
raise ValueError(f"Unknown namespace {root_name.namespace}")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue