From f7b43bbefae7edada00cef1322a5d929c6d6f21b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 23 Jun 2020 19:03:58 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20ppn2ocr:=20Support=20TIFF=20in=20th?= =?UTF-8?q?e=20BEST=20group?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ppn2ocr | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ppn2ocr b/ppn2ocr index bcdc8bf..d24a463 100755 --- a/ppn2ocr +++ b/ppn2ocr @@ -39,7 +39,7 @@ def oai_mets(ppn): return mets -def iiif_url_for_dms_url(dms_url, ppn, size): +def iiif_url_for_dms_url(dms_url, ppn, size, format): """ Construct an IIIF URL from a dms URL. @@ -53,7 +53,8 @@ def iiif_url_for_dms_url(dms_url, ppn, size): else: raise ValueError(f"Unexpected URL {dms_url}") iiif_identifier = f'{ppn}-{page_num}' - iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/default.jpg' + iiif_quality = 'default' + iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/{iiif_quality}.{format}' return iiif_url @@ -66,6 +67,17 @@ def remove_file_grp(mets, use): bad.getparent().remove(bad) +def mime_type_for_format(format_): + if format_ == 'tif': + mime_type = 'image/tiff' + elif format_ == 'jpg': + mime_type = 'image/jpg' + else: + raise ValueError() + + return mime_type + + def make_workspace(ppn, workspace): # Make workspace directory os.mkdir(workspace) @@ -81,6 +93,7 @@ def make_workspace(ppn, workspace): # Duplicate DEFAULT file group into a new file group BEST + format_ = 'tif' file_grp_default = mets.find('//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS) file_grp_best = deepcopy(file_grp_default) @@ -89,6 +102,7 @@ def make_workspace(ppn, workspace): old_id = f.attrib['ID'] new_id = re.sub('DEFAULT', 'BEST', old_id) f.attrib['ID'] = new_id + f.attrib['MIMETYPE'] = mime_type_for_format(format_) for fptr in mets.findall(f'//mets:fptr[@FILEID="{old_id}"]', namespaces=XMLNS): new_fptr = deepcopy(fptr) @@ -98,7 +112,7 @@ def make_workspace(ppn, workspace): # XXX Need to fumble around with the URL for now flocat = f.find(f".//{{{XMLNS['mets']}}}FLocat") old_url = flocat.attrib[f"{{{XMLNS['xlink']}}}href"] - url_iiif_full = iiif_url_for_dms_url(old_url, ppn, 'full') + url_iiif_full = iiif_url_for_dms_url(old_url, ppn, 'full', format_) flocat.attrib[f"{{{XMLNS['xlink']}}}href"] = url_iiif_full mets.find('//mets:fileSec', namespaces=XMLNS).append(file_grp_best)