From fec7aa5f00e3e1061070b22f349f9a6052368ece Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 27 Jul 2021 12:32:12 +0200 Subject: [PATCH 1/7] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20ocrd-galley:=20eynolla?= =?UTF-8?q?h=200.0.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile-eynollah | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile-eynollah b/Dockerfile-eynollah index 335344a..508db57 100644 --- a/Dockerfile-eynollah +++ b/Dockerfile-eynollah @@ -2,7 +2,7 @@ ARG DRONE_COMMIT="latest" FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" -ARG EYNOLLAH_VERSION="0.0.6" +ARG EYNOLLAH_VERSION="0.0.7" # Build pip installable stuff From edeb870afa55245f59058c53d412d6d1dda62ea5 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 28 Jul 2021 14:37:53 +0200 Subject: [PATCH 2/7] =?UTF-8?q?=E2=AC=86=20eynollah=200.0.8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile-eynollah | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile-eynollah b/Dockerfile-eynollah index 508db57..20f62f6 100644 --- a/Dockerfile-eynollah +++ b/Dockerfile-eynollah @@ -2,7 +2,7 @@ ARG DRONE_COMMIT="latest" FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" -ARG EYNOLLAH_VERSION="0.0.7" +ARG EYNOLLAH_VERSION="0.0.8" # Build pip installable stuff From 6ae4bc8e3ace3bbc67b401acecba328891e9a05d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 15 Sep 2021 17:11:12 +0200 Subject: [PATCH 3/7] =?UTF-8?q?=E2=9A=99=EF=B8=8F=20ppn2ocr:=20Use=20new?= =?UTF-8?q?=20API=5FURL=20(https://oai.sbb.berlin)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ppn2ocr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppn2ocr b/ppn2ocr index af6540b..d880a08 100755 --- a/ppn2ocr +++ b/ppn2ocr @@ 
-14,7 +14,7 @@ XMLNS = { 'mets': 'http://www.loc.gov/METS/', 'xlink': 'http://www.w3.org/1999/xlink' } -API_URL = 'https://digital.staatsbibliothek-berlin.de/oai' +API_URL = 'https://oai.sbb.berlin' IDENTIFIER_TEMPLATE = 'oai:digital.staatsbibliothek-berlin.de:%s' From 91296ffa0e09dd1e21b36ef1c1df09d248e3445b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 15 Sep 2021 17:12:11 +0200 Subject: [PATCH 4/7] =?UTF-8?q?=E2=9A=99=EF=B8=8F=20ppn2ocr:=20Move=20prun?= =?UTF-8?q?ing=20file=20groups=20into=20a=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ppn2ocr | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/ppn2ocr b/ppn2ocr index d880a08..a152acc 100755 --- a/ppn2ocr +++ b/ppn2ocr @@ -78,6 +78,14 @@ def mime_type_for_format(format_): return mime_type +def prune_file_grps(mets): + # XXX + # Delete PRESENTATION + LOCAL file groups + # (local file:/// or file:/ links, not handled well by "ocrd workspace") + remove_file_grp(mets, 'PRESENTATION') + remove_file_grp(mets, 'LOCAL') + + def make_workspace(ppn, workspace): # Make workspace directory os.mkdir(workspace) @@ -85,11 +93,6 @@ def make_workspace(ppn, workspace): mets = oai_mets(ppn) - # XXX - # Delete PRESENTATION + LOCAL file groups - # (local file:/// or file:/ links, not handled well by "ocrd workspace") - remove_file_grp(mets, 'PRESENTATION') - remove_file_grp(mets, 'LOCAL') # Delete MAX file group - we assume that, if it exists, it is not as @@ -125,6 +128,10 @@ def make_workspace(ppn, workspace): mets.find('//mets:fileSec', namespaces=XMLNS).append(file_grp_best) + + prune_file_grps(mets) + + # Write mets.xml mets.write('mets.xml', pretty_print=True) From f197f01d3f7f0366f43791e5eb47ce3103f738cd Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 15 Sep 2021 17:26:14 +0200 Subject: [PATCH 5/7] =?UTF-8?q?=E2=9C=A8=20ppn2ocr:=20Keep=20only=20wanted?= =?UTF-8?q?=20file=20groups?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ppn2ocr | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/ppn2ocr b/ppn2ocr index a152acc..cbd0030 100755 --- a/ppn2ocr +++ b/ppn2ocr @@ -79,11 +79,22 @@ def mime_type_for_format(format_): def prune_file_grps(mets): - # XXX - # Delete PRESENTATION + LOCAL file groups - # (local file:/// or file:/ links, not handled well by "ocrd workspace") - remove_file_grp(mets, 'PRESENTATION') - remove_file_grp(mets, 'LOCAL') + """ + Prune unwanted file groups + + We only want to keep the MAX file group (we created it ourselves) and + possibly ABBYY full texts in FULLTEXT. + + We definitely want to delete the PRESENTATION + LOCAL file groups + because they contain local file:/// or file:/ links, which are not handled + well by "ocrd workspace". They are not explicitly mentioned, as we + only keep a whitelist. + """ + wanted_file_grps = ["MAX", "FULLTEXT"] + + for u in mets.xpath('//mets:fileGrp/@USE', namespaces=XMLNS): + if u not in wanted_file_grps: + remove_file_grp(mets, u) def make_workspace(ppn, workspace): From 9a2cfa35d119be9e0351c8ef442b1b8d6aaf8a95 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 15 Sep 2021 17:37:31 +0200 Subject: [PATCH 6/7] =?UTF-8?q?=F0=9F=8E=A8=20ppn2ocr:=20Fix=20bad=20inden?= =?UTF-8?q?tation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ppn2ocr | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ppn2ocr b/ppn2ocr index cbd0030..b43e227 100755 --- a/ppn2ocr +++ b/ppn2ocr @@ -11,8 +11,8 @@ from copy import deepcopy XMLNS = { - 'mets': 'http://www.loc.gov/METS/', - 'xlink': 'http://www.w3.org/1999/xlink' + 'mets': 'http://www.loc.gov/METS/', + 'xlink': 'http://www.w3.org/1999/xlink' } API_URL = 'https://oai.sbb.berlin' IDENTIFIER_TEMPLATE = 'oai:digital.staatsbibliothek-berlin.de:%s' @@ -93,8 +93,8 @@ def prune_file_grps(mets): wanted_file_grps = 
["MAX", "FULLTEXT"] for u in mets.xpath('//mets:fileGrp/@USE', namespaces=XMLNS): - if u not in wanted_file_grps: - remove_file_grp(mets, u) + if u not in wanted_file_grps: + remove_file_grp(mets, u) def make_workspace(ppn, workspace): @@ -115,7 +115,7 @@ def make_workspace(ppn, workspace): file_grp_default = mets.find('//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS) if file_grp_default is None: - raise ValueError("This document has no DEFAULT file group, could be a multi-volume work") + raise ValueError("This document has no DEFAULT file group, could be a multi-volume work") file_grp_best = deepcopy(file_grp_default) From c65dbf9b1f356b5123f3a8133a5f87c13157d69b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 15 Sep 2021 17:45:26 +0200 Subject: [PATCH 7/7] =?UTF-8?q?=F0=9F=A7=B9=20ppn2ocr:=20Remove=20obsolete?= =?UTF-8?q?=20comments=20re=20file:=20URLs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ppn2ocr | 7 ------- 1 file changed, 7 deletions(-) diff --git a/ppn2ocr b/ppn2ocr index b43e227..140a8a3 100755 --- a/ppn2ocr +++ b/ppn2ocr @@ -150,13 +150,6 @@ def make_workspace(ppn, workspace): # Validate workspace #ocrd workspace validate mets.xml | grep -v "Won't download remote image" - # XXX - # Fix 'file:/' URLs to 'file:///' - #sed -i 's#file:/\([^/]\)#file:///\1#' mets.xml - - # Patch mets.xml to use our NFS mount - #sed -i 's#file:///goobi/tiff001/sbb/#file:///srv/digisam_images/sbb/#g' mets.xml - def validate_ppn(ctx, param, value): """Validate a PPN argument"""