From 9b3db1cd1d1cf8b595d53e1aff2703d9fee742a0 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 10 May 2022 19:32:26 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20ALTO:=20Support=20more=20ALTO=20ver?= =?UTF-8?q?sions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/altotool.py | 7 ++++++- qurator/modstool/lib.py | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/qurator/modstool/altotool.py b/qurator/modstool/altotool.py index 3115f67..1cecc03 100755 --- a/qurator/modstool/altotool.py +++ b/qurator/modstool/altotool.py @@ -39,11 +39,14 @@ def alto_to_dict(alto, raise_errors=True): value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() elif localname == 'OCRProcessing': value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors) + elif localname == 'Processing': + # TODO This enumerated descent is used more than once, DRY! + for n, e in enumerate(group): + value[f'{localname}{n}'] = alto_to_dict(e, raise_errors) elif localname == 'ocrProcessingStep': for n, e in enumerate(group): value[f'{localname}{n}'] = alto_to_dict(e, raise_errors) elif localname == 'preProcessingStep': - # TODO This enumerated descent is used more than once, DRY! for n, e in enumerate(group): value[f'{localname}{n}'] = alto_to_dict(e, raise_errors) elif localname == 'processingDateTime': @@ -76,6 +79,8 @@ def alto_to_dict(alto, raise_errors=True): value['Page'].update(TagGroup(tag, group).subelement_counts()) elif localname == 'Styles': pass + elif localname == 'Tags': + pass else: if raise_errors: print(value) diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py index 1e8b560..75d0f86 100644 --- a/qurator/modstool/lib.py +++ b/qurator/modstool/lib.py @@ -231,6 +231,7 @@ def _to_dict(root, raise_errors): "http://schema.ccs-gmbh.com/ALTO", "http://www.loc.gov/standards/alto/", "http://www.loc.gov/standards/alto/ns-v2#", + "http://www.loc.gov/standards/alto/ns-v4#", ]: return alto_to_dict(root, raise_errors) else: