From 448bf9e25642affe38aed964a18db1cbc6a5f762 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 4 Jun 2020 19:55:00 +0200 Subject: [PATCH 1/5] =?UTF-8?q?=F0=9F=90=9B=20ppn2ocr:=20Remove=20LOCAL=20?= =?UTF-8?q?file=20group=20too?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ppn2ocr | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ppn2ocr b/ppn2ocr index c21e170..db1ee06 100755 --- a/ppn2ocr +++ b/ppn2ocr @@ -87,9 +87,10 @@ def make_workspace(ppn, workspace): mets = oai_mets(ppn) # XXX - # Delete PRESENTATION file group - # (local file:/// links, not handled well by "ocrd workspace") + # Delete PRESENTATION + LOCAL file groups + # (local file:/// or file:/ links, not handled well by "ocrd workspace") remove_file_grp(mets, 'PRESENTATION') + remove_file_grp(mets, 'LOCAL') # Duplicate DEFAULT file group into a new file group BEST From 0aa541fa186a71df535880bc733cd7990d89c94c Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 18 Jun 2020 14:46:01 +0200 Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=93=93=20README:=20Reference=20howto/?= =?UTF-8?q?*proxy*.md=20instead=20of=20duplicating=20the=20proxy=20setting?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index d7588c9..5af376f 100644 --- a/README.md +++ b/README.md @@ -72,9 +72,6 @@ This produces a workspace directory `PPN77164308X` with the OCR results in it; the results are viewable as explained above. ppn2ocr requires a working Docker setup and properly set up environment -variables for the proxy configuration. 
At SBB, this means: -~~~ -export HTTP_PROXY=http://http-proxy.sbb.spk-berlin.de:3128/ -export HTTPS_PROXY=$HTTP_PROXY; export http_proxy=$HTTP_PROXY; export https_proxy=$HTTP_PROXY -export no_proxy=localhost,digital.staatsbibliothek-berlin.de,content.staatsbibliothek-berlin.de -~~~ +variables for the proxy configuration. At SBB, this means following +`howto/docker-proxy.md` and `howto/proxy-settings-for-shell+python.md` +(in qurator's mono-repo). From c334b1e7ac89f0cb5ee26e85acb6df7779b59542 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 19 Jun 2020 16:01:07 +0200 Subject: [PATCH 3/5] =?UTF-8?q?=F0=9F=A7=B9=20Move=20check-FULLTEXT-Page-d?= =?UTF-8?q?imensions-vs-BEST-dimensions.py=20code=20to=20mono-repo/experim?= =?UTF-8?q?ents?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...TEXT-Page-dimensions-vs-BEST-dimensions.py | 47 ------------------- 1 file changed, 47 deletions(-) delete mode 100644 check-FULLTEXT-Page-dimensions-vs-BEST-dimensions.py diff --git a/check-FULLTEXT-Page-dimensions-vs-BEST-dimensions.py b/check-FULLTEXT-Page-dimensions-vs-BEST-dimensions.py deleted file mode 100644 index d332214..0000000 --- a/check-FULLTEXT-Page-dimensions-vs-BEST-dimensions.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Check FULLTEXT ALTO page dimensions against BEST image dimensions""" - -import PIL.Image -import sys -from ocrd.workspace import Workspace -from ocrd.resolver import Resolver -from lxml import etree as ET - - -def alto_namespace(tree): - """ - Return the ALTO namespace used in the given ElementTree. - - This relies on the assumption that, in any given ALTO file, the root - element has the local name "alto". We do not check if the files uses any - valid ALTO namespace. 
- """ - root_name = ET.QName(tree.getroot().tag) - if root_name.localname == 'alto': - return root_name.namespace - else: - raise ValueError('Not an ALTO tree') - - -exit_code = 0 -workspace = Workspace(Resolver(), '.') - -for n, page_id in enumerate(workspace.mets.physical_pages): - gt_file = workspace.mets.find_files(fileGrp='FULLTEXT', pageId=page_id)[0] - img_file = workspace.mets.find_files(fileGrp='BEST', pageId=page_id)[0] - gt_file = workspace.download_file(gt_file) - img_file = workspace.download_file(img_file) - - tree = ET.parse(gt_file.local_filename) - nsmap = {'alto': alto_namespace(tree)} - alto_page = tree.find('//alto:Page', namespaces=nsmap) # one page assumed - gt_size = int(alto_page.attrib['WIDTH']), int(alto_page.attrib['HEIGHT']) - - img_size = PIL.Image.open(img_file.local_filename).size - - if gt_size == img_size: - print('OK', page_id) - else: - print('ERR', page_id, gt_size, '!=', img_size) - exit_code = 1 - -sys.exit(exit_code) From f7b0b4121dd7c47215d7b8d04b3c8e7c3ff0183d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 19 Jun 2020 17:47:35 +0200 Subject: [PATCH 4/5] =?UTF-8?q?=E2=81=89=EF=B8=8F=20Check=20dependencies?= =?UTF-8?q?=20using=20pipdeptree=20to=20triage=20Travis=20build=20fail?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 0816a86..a6a259f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -70,10 +70,13 @@ COPY data/textline_detection /var/lib/textline_detection # Install requirements +# Using pipdeptree here to get more info than from pip3 check COPY requirements.txt /tmp/ RUN pip3 install --no-cache-dir --upgrade pip && \ pip3 install --no-cache-dir -r /tmp/requirements.txt && \ - pip3 check + pip3 install --no-cache-dir pipdeptree && \ + pipdeptree -w fail + COPY my_ocrd_workflow /usr/bin/ COPY xsd/* /usr/share/xml/ From c5ae23d2efcc81847bd689a6a31187d07176f858 
Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 19 Jun 2020 19:27:32 +0200 Subject: [PATCH 5/5] =?UTF-8?q?=E2=9C=A8=20Validate=20before=20even=20star?= =?UTF-8?q?ting,=20to=20find=20data=20problems?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- my_ocrd_workflow | 2 ++ 1 file changed, 2 insertions(+) diff --git a/my_ocrd_workflow b/my_ocrd_workflow index eab0c46..86b59a1 100755 --- a/my_ocrd_workflow +++ b/my_ocrd_workflow @@ -134,6 +134,8 @@ page_upgrade_to_2019() { main() { + do_validate + do_binarization do_validate