Mirror of https://github.com/qurator-spk/modstool.git (synced 2025-11-03 19:04:13 +01:00)
			
		
		
		
	✨ ALTO: Support more ALTO versions
This commit is contained in:
		
parent c91c9b1714
commit c85356bd23
					
				
					 2 changed files with 43 additions and 38 deletions
				
			
		| 
						 | 
				
			
			@ -31,50 +31,50 @@ def alto_to_dict(alto, raise_errors=True):
 | 
			
		|||
    for tag, group in sorted_groupby(alto, key=attrgetter('tag')):
 | 
			
		||||
        group = list(group)
 | 
			
		||||
 | 
			
		||||
        # XXX Namespaces seem to use a trailing / sometimes, sometimes not.
 | 
			
		||||
        #     (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS})
 | 
			
		||||
        if tag == '{http://www.loc.gov/standards/alto/ns-v2#}Description':
 | 
			
		||||
            value['Description'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}MeasurementUnit':
 | 
			
		||||
            value['MeasurementUnit'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}OCRProcessing':
 | 
			
		||||
            value['OCRProcessing'] = TagGroup(tag, group).is_singleton().descend(raise_errors)
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}ocrProcessingStep':
 | 
			
		||||
        localname = ET.QName(tag).localname
 | 
			
		||||
 | 
			
		||||
        if localname == 'Description':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
 | 
			
		||||
        elif localname == 'MeasurementUnit':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif localname == 'OCRProcessing':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
 | 
			
		||||
        elif localname == 'ocrProcessingStep':
 | 
			
		||||
            for n, e in enumerate(group):
 | 
			
		||||
                value['ocrProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors)
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}preProcessingStep':
 | 
			
		||||
                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
 | 
			
		||||
        elif localname == 'preProcessingStep':
 | 
			
		||||
            # TODO This enumerated descent is used more than once, DRY!
 | 
			
		||||
            for n, e in enumerate(group):
 | 
			
		||||
                value['preProcessingStep{}'.format(n)] = alto_to_dict(e, raise_errors)
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingDateTime':
 | 
			
		||||
            value['processingDateTime'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingSoftware':
 | 
			
		||||
            value['processingSoftware'] = TagGroup(tag, group).is_singleton().descend(raise_errors)
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingAgency':
 | 
			
		||||
            value['processingAgency'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepDescription':
 | 
			
		||||
            value['processingStepDescription'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}processingStepSettings':
 | 
			
		||||
            value['processingStepSettings'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareCreator':
 | 
			
		||||
            value['softwareCreator'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareName':
 | 
			
		||||
            value['softwareName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}softwareVersion':
 | 
			
		||||
            value['softwareVersion'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
 | 
			
		||||
        elif localname == 'processingDateTime':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif localname == 'processingSoftware':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
 | 
			
		||||
        elif localname == 'processingAgency':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif localname == 'processingStepDescription':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif localname == 'processingStepSettings':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif localname == 'softwareCreator':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif localname == 'softwareName':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif localname == 'softwareVersion':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}sourceImageInformation':
 | 
			
		||||
            value['sourceImageInformation'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}fileName':
 | 
			
		||||
            value['fileName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
        elif localname == 'sourceImageInformation':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
 | 
			
		||||
        elif localname == 'fileName':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 | 
			
		||||
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Layout':
 | 
			
		||||
            value['Layout'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Page':
 | 
			
		||||
        elif localname == 'Layout':
 | 
			
		||||
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
 | 
			
		||||
        elif localname == 'Page':
 | 
			
		||||
            value['Page'] = {}
 | 
			
		||||
            value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
 | 
			
		||||
            value['Page'].update(TagGroup(tag, group).subelement_counts())
 | 
			
		||||
        elif tag == '{http://www.loc.gov/standards/alto/ns-v2#}Styles':
 | 
			
		||||
        elif localname == 'Styles':
 | 
			
		||||
            pass
 | 
			
		||||
        else:
 | 
			
		||||
            if raise_errors:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -191,7 +191,9 @@ class TagGroup:
 | 
			
		|||
        """
 | 
			
		||||
        attrib = {}
 | 
			
		||||
        for e in self.group:
 | 
			
		||||
            attrib.update(e.attrib)
 | 
			
		||||
            for a, v in e.attrib.items():
 | 
			
		||||
                a_localname = ET.QName(a).localname
 | 
			
		||||
                attrib[a_localname] = v
 | 
			
		||||
        return attrib
 | 
			
		||||
 | 
			
		||||
    def subelement_counts(self):
 | 
			
		||||
| 
						 | 
				
			
			@ -224,7 +226,10 @@ def _to_dict(root, raise_errors):
 | 
			
		|||
        return mods_to_dict(root, raise_errors)
 | 
			
		||||
    elif root_name.namespace == "http://www.loc.gov/METS/":
 | 
			
		||||
        return mets_to_dict(root, raise_errors)
 | 
			
		||||
    elif root_name.namespace == "http://www.loc.gov/standards/alto/ns-v2#":
 | 
			
		||||
    elif root_name.namespace in [
 | 
			
		||||
        "http://www.loc.gov/standards/alto/ns-v2#",
 | 
			
		||||
        "http://www.loc.gov/standards/alto/",
 | 
			
		||||
    ]:
 | 
			
		||||
        return alto_to_dict(root, raise_errors)
 | 
			
		||||
    else:
 | 
			
		||||
        raise ValueError(f"Unknown namespace {root_name.namespace}")
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue