From cae56e19db3b59d39c4bd2fda1c0a755ac649f00 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 25 Sep 2020 18:40:00 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20Check=20image=20dimensions=20if?= =?UTF-8?q?=20FULLTEXT=20exists?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ppn2ocr | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/ppn2ocr b/ppn2ocr index dd5ffe8..ca37b8c 100755 --- a/ppn2ocr +++ b/ppn2ocr @@ -8,6 +8,7 @@ import re import subprocess import click from copy import deepcopy +from collections import defaultdict XMLNS = { @@ -114,7 +115,31 @@ def make_workspace(ppn, workspace): url_iiif_full = iiif_url_for_dms_url(old_url, ppn, 'full', format_) flocat.attrib[f"{{{XMLNS['xlink']}}}href"] = url_iiif_full - mets.find('//mets:fileSec', namespaces=XMLNS).append(file_grp_best) + mets.find("//mets:fileSec", namespaces=XMLNS).append(file_grp_best) + + # Check image dimensions if FULLTEXT exists + file_grp_fulltext = mets.find('//mets:fileGrp[@USE="FULLTEXT"]', namespaces=XMLNS) + if file_grp_fulltext is not None: + # Collect all file IDs for every file group + file_ids_for_group = defaultdict(list) + for file_grp in mets.findall("//mets:fileGrp", namespaces=XMLNS): + for file_ in file_grp.findall("mets:file", namespaces=XMLNS): + print(file_.attrib["ID"]) + file_ids_for_group[file_grp.attrib["USE"]].append(file_.attrib["ID"]) + + for page in mets.findall( + '//mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=XMLNS, + ): + print() + for fptr in page.findall("mets:fptr", namespaces=XMLNS): + print(fptr) + # XXX + # For every TYPE="page" in : + # Check image dimension of + # + # + # (need to find FILEID for pageid in given file group) # Write mets.xml mets.write('mets.xml', pretty_print=True)