From bb703152db47a4f522d581f240822643c1516eac Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 23 Jun 2020 15:15:21 +0200 Subject: [PATCH 1/4] =?UTF-8?q?=F0=9F=90=9B=20ppn2ocr:=20Verify=20oai.sbb.?= =?UTF-8?q?berlin's=20certificate=20again?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that oai.sbb.berlin's certificate chain is fixed, remove the workaround again. Fixes GH#15. --- ppn2ocr | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ppn2ocr b/ppn2ocr index 242a8d8..7258608 100755 --- a/ppn2ocr +++ b/ppn2ocr @@ -32,9 +32,7 @@ def oai_mets(ppn): } s = requests.Session() - # FIXME oai.sbb.berlin fails certificate check - #r = s.get(API_URL, params=params) - r = s.get(API_URL, params=params, verify=False) + r = s.get(API_URL, params=params) mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets") mets = ET.ElementTree(mets) From f7b43bbefae7edada00cef1322a5d929c6d6f21b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 23 Jun 2020 19:03:58 +0200 Subject: [PATCH 2/4] =?UTF-8?q?=E2=9C=A8=20ppn2ocr:=20Support=20TIFF=20in?= =?UTF-8?q?=20the=20BEST=20group?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ppn2ocr | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ppn2ocr b/ppn2ocr index bcdc8bf..d24a463 100755 --- a/ppn2ocr +++ b/ppn2ocr @@ -39,7 +39,7 @@ def oai_mets(ppn): return mets -def iiif_url_for_dms_url(dms_url, ppn, size): +def iiif_url_for_dms_url(dms_url, ppn, size, format): """ Construct an IIIF URL from a dms URL. @@ -53,7 +53,8 @@ def iiif_url_for_dms_url(dms_url, ppn, size): else: raise ValueError(f"Unexpected URL {dms_url}") iiif_identifier = f'{ppn}-{page_num}' - iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/default.jpg' + iiif_quality = 'default' + iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/{iiif_quality}.{format}' return iiif_url @@ -66,6 +67,17 @@ def remove_file_grp(mets, use): bad.getparent().remove(bad) +def mime_type_for_format(format_): + if format_ == 'tif': + mime_type = 'image/tiff' + elif format_ == 'jpg': + mime_type = 'image/jpg' + else: + raise ValueError() + + return mime_type + + def make_workspace(ppn, workspace): # Make workspace directory os.mkdir(workspace) @@ -81,6 +93,7 @@ def make_workspace(ppn, workspace): # Duplicate DEFAULT file group into a new file group BEST + format_ = 'tif' file_grp_default = mets.find('//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS) file_grp_best = deepcopy(file_grp_default) @@ -89,6 +102,7 @@ def make_workspace(ppn, workspace): old_id = f.attrib['ID'] new_id = re.sub('DEFAULT', 'BEST', old_id) f.attrib['ID'] = new_id + f.attrib['MIMETYPE'] = mime_type_for_format(format_) for fptr in mets.findall(f'//mets:fptr[@FILEID="{old_id}"]', namespaces=XMLNS): new_fptr = deepcopy(fptr) @@ -98,7 +112,7 @@ def make_workspace(ppn, workspace): # XXX Need to fumble around with the URL for now flocat = f.find(f".//{{{XMLNS['mets']}}}FLocat") old_url = flocat.attrib[f"{{{XMLNS['xlink']}}}href"] - url_iiif_full = iiif_url_for_dms_url(old_url, ppn, 'full') + url_iiif_full = iiif_url_for_dms_url(old_url, ppn, 'full', format_) flocat.attrib[f"{{{XMLNS['xlink']}}}href"] = url_iiif_full mets.find('//mets:fileSec', namespaces=XMLNS).append(file_grp_best) From ef3a8a69e054f633076d98e8f2bb0987ab067a6e Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 29 Jul 2020 16:45:21 +0200 Subject: [PATCH 3/4] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20Update=20ocrd=5Folena?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a6a259f..84de53b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM ubuntu:18.04 ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 +ENV OCRD_OLENA_VERSION 1.1.10 ENV TESSDATA_BEST_VERSION 4.0.0 ENV TESSDATA_PREFIX /usr/local/share/tessdata @@ -47,7 +48,7 @@ RUN curl -sSL -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena apt-get -f install -y && \ apt-get clean && rm -rf /var/lib/apt/lists/* RUN pip3 install --no-cache-dir --upgrade pip && \ - curl -sSL -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v1.1.4.tar.gz && \ + curl -sSL -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ mkdir ocrd_olena && \ tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \ cd ocrd_olena && \ From 7fe2ce84b5ea208bdbf40135ecd51d4827e09fc4 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 29 Jul 2020 16:46:00 +0200 Subject: [PATCH 4/4] =?UTF-8?q?=F0=9F=90=9B=20Update=20sbb=5Ftextline=5Fde?= =?UTF-8?q?tector=20to=20fix=20Keras/TF=20issue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index df1f69a..9ea09f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,6 @@ ocrd_tesserocr >= 0.8.1 ocrd_calamari >= 0.0.6 -https://github.com/qurator-spk/sbb_textline_detector/archive/8618be2.tar.gz +https://github.com/qurator-spk/sbb_textline_detector/archive/4036e2a5.tar.gz https://github.com/qurator-spk/dinglehopper/archive/745095e.tar.gz