From 691be243f6a1b78f4bd1fc1aeaafeae893099446 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 18 Feb 2021 16:34:25 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Use=20MAX=20file=20group=20name=20i?= =?UTF-8?q?nstead=20of=20BEST?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We were using the file group name BEST for what Kitodo seems to call MAX by convention. So we use MAX now. Currently, we work under the assumption that, if MAX exists in the METS retrieved by OAI-PMH, it's not what we want and we replace it with our own IIIF URLS with full size. Fixes GH-43. --- README.md | 2 +- ppn2ocr | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index b76a8b5..65e6e3e 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ The document must be specified by its PPN, for example: ~~~ ~/devel/ocrd-galley/ppn2ocr PPN77164308X cd PPN77164308X -~/devel/ocrd-galley/my_ocrd_workflow -I BEST --skip-validation +~/devel/ocrd-galley/my_ocrd_workflow -I MAX --skip-validation ~~~ This produces a workspace directory `PPN77164308X` with the OCR results in it; diff --git a/ppn2ocr b/ppn2ocr index dd5ffe8..e7184e3 100755 --- a/ppn2ocr +++ b/ppn2ocr @@ -91,15 +91,20 @@ def make_workspace(ppn, workspace): remove_file_grp(mets, 'PRESENTATION') remove_file_grp(mets, 'LOCAL') - # Duplicate DEFAULT file group into a new file group BEST + + # Delete MAX file group - we assume that, if it exists, it is not as + # we expect it, e.g. IIIF full URLs + remove_file_grp(mets, 'MAX') + + # Duplicate DEFAULT file group into a new file group MAX format_ = 'tif' file_grp_default = mets.find('//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS) file_grp_best = deepcopy(file_grp_default) - file_grp_best.attrib['USE'] = 'BEST' + file_grp_best.attrib['USE'] = 'MAX' for f in file_grp_best.findall('./mets:file', namespaces=XMLNS): old_id = f.attrib['ID'] - new_id = re.sub('DEFAULT', 'BEST', old_id) + new_id = re.sub('DEFAULT', 'MAX', old_id) f.attrib['ID'] = new_id f.attrib['MIMETYPE'] = mime_type_for_format(format_) @@ -157,7 +162,7 @@ def ppn2ocr(ppn): # XXX # subprocess.run([ # os.path.join(self_dir, 'run-docker-hub'), - # '-I', 'BEST', + # '-I', 'MAX', # '--skip-validation' # ])