diff --git a/Dockerfile b/Dockerfile index a6a259f..84de53b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM ubuntu:18.04 ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 +ENV OCRD_OLENA_VERSION 1.1.10 ENV TESSDATA_BEST_VERSION 4.0.0 ENV TESSDATA_PREFIX /usr/local/share/tessdata @@ -47,7 +48,7 @@ RUN curl -sSL -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena apt-get -f install -y && \ apt-get clean && rm -rf /var/lib/apt/lists/* RUN pip3 install --no-cache-dir --upgrade pip && \ - curl -sSL -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v1.1.4.tar.gz && \ + curl -sSL -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ mkdir ocrd_olena && \ tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \ cd ocrd_olena && \ diff --git a/ppn2ocr b/ppn2ocr index 5f20dcb..d24a463 100755 --- a/ppn2ocr +++ b/ppn2ocr @@ -32,16 +32,14 @@ def oai_mets(ppn): } s = requests.Session() - # FIXME oai.sbb.berlin fails certificate check - #r = s.get(API_URL, params=params) - r = s.get(API_URL, params=params, verify=False) + r = s.get(API_URL, params=params) mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets") mets = ET.ElementTree(mets) return mets -def iiif_url_for_dms_url(dms_url, ppn, size): +def iiif_url_for_dms_url(dms_url, ppn, size, format): """ Construct an IIIF URL from a dms URL. @@ -55,7 +53,8 @@ def iiif_url_for_dms_url(dms_url, ppn, size): else: raise ValueError(f"Unexpected URL {dms_url}") iiif_identifier = f'{ppn}-{page_num}' - iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/default.jpg' + iiif_quality = 'default' + iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/{iiif_quality}.{format}' return iiif_url @@ -68,6 +67,17 @@ def remove_file_grp(mets, use): bad.getparent().remove(bad) +def mime_type_for_format(format_): + if format_ == 'tif': + mime_type = 'image/tiff' + elif format_ == 'jpg': + mime_type = 'image/jpg' + else: + raise ValueError() + + return mime_type + + def make_workspace(ppn, workspace): # Make workspace directory os.mkdir(workspace) @@ -83,6 +93,7 @@ def make_workspace(ppn, workspace): # Duplicate DEFAULT file group into a new file group BEST + format_ = 'tif' file_grp_default = mets.find('//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS) file_grp_best = deepcopy(file_grp_default) @@ -91,6 +102,7 @@ def make_workspace(ppn, workspace): old_id = f.attrib['ID'] new_id = re.sub('DEFAULT', 'BEST', old_id) f.attrib['ID'] = new_id + f.attrib['MIMETYPE'] = mime_type_for_format(format_) for fptr in mets.findall(f'//mets:fptr[@FILEID="{old_id}"]', namespaces=XMLNS): new_fptr = deepcopy(fptr) @@ -100,7 +112,7 @@ def make_workspace(ppn, workspace): # XXX Need to fumble around with the URL for now flocat = f.find(f".//{{{XMLNS['mets']}}}FLocat") old_url = flocat.attrib[f"{{{XMLNS['xlink']}}}href"] - url_iiif_full = iiif_url_for_dms_url(old_url, ppn, 'full') + url_iiif_full = iiif_url_for_dms_url(old_url, ppn, 'full', format_) flocat.attrib[f"{{{XMLNS['xlink']}}}href"] = url_iiif_full mets.find('//mets:fileSec', namespaces=XMLNS).append(file_grp_best) diff --git a/requirements.txt b/requirements.txt index df1f69a..9ea09f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,6 @@ ocrd_tesserocr >= 0.8.1 ocrd_calamari >= 0.0.6 -https://github.com/qurator-spk/sbb_textline_detector/archive/8618be2.tar.gz +https://github.com/qurator-spk/sbb_textline_detector/archive/4036e2a5.tar.gz https://github.com/qurator-spk/dinglehopper/archive/745095e.tar.gz