From 0bc06c2fad2541345a62854d826307fe52ac321a Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 21 Aug 2019 11:54:01 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Run=20Calamari=20OCR?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitmodules | 3 +++ Dockerfile | 3 +++ build | 8 ++++++++ data | 1 + my_ocrd_workflow | 10 ++++++++++ requirements.txt | 3 +++ run | 2 +- 7 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 .gitmodules create mode 160000 data diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..a08c53c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "data"] + path = data + url = git@code.dev.sbb.berlin:qurator/qurator-data.git diff --git a/Dockerfile b/Dockerfile index 5e85272..2a21dfb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,9 @@ FROM ubuntu:18.04 ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 +RUN mkdir /var/lib/calamari-models +COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR + RUN apt-get update && \ apt-get install -y \ git \ diff --git a/build b/build index 657e02c..4037885 100755 --- a/build +++ b/build @@ -1,2 +1,10 @@ #!/bin/sh +set -e + +( + cd data + git annex get calamari-models/GT4HistOCR + git annex unlock calamari-models/GT4HistOCR # So they are not symlinks +) + docker build -t my_ocrd_workflow . diff --git a/data b/data new file mode 160000 index 0000000..9af6a6c --- /dev/null +++ b/data @@ -0,0 +1 @@ +Subproject commit 9af6a6c6c882b1dfdbf1efba7b1f555638c2ffd5 diff --git a/my_ocrd_workflow b/my_ocrd_workflow index 7e8612e..c7fe67e 100755 --- a/my_ocrd_workflow +++ b/my_ocrd_workflow @@ -81,6 +81,14 @@ do_ocr() { -p <(echo $ocrd_tesserocr_recognize_parameters) } +do_ocr_calamari() { + ocrd_calamari_recognize_parameters='{ "checkpoint": "/var/lib/calamari-models/GT4HistOCR/*.ckpt.json" }' + remove_filegrp OCR-D-OCR-CALAMARI mets.xml + ocrd-calamari-recognize -l $LOG_LEVEL \ + -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI \ + -p <(echo $ocrd_calamari_recognize_parameters) +} + page_validate_xml() { # Validate all PAGE XML against the XML schema @@ -141,6 +149,8 @@ page_validate_xml OCR-D-SEG-REGION page_validate_xml OCR-D-SEG-LINE do_validate +do_ocr_calamari + do_ocr page_validate_xml OCR-D-OCR-TESS page_workaround_remove_conf OCR-D-OCR-TESS diff --git a/requirements.txt b/requirements.txt index 0e6b298..384e562 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,7 @@ https://github.com/mikegerber/ocrd_kraken/archive/fix/pass-down-page-id.tar.gz tesserocr == 2.3.1 # 2.4.0 fails with Ubuntu 18.04's tesseract ocrd_tesserocr +setuptools >= 41.0.0 # FIXME tensorboard seems to depend on this, but why do we get an error at runtime? +https://github.com/mikegerber/ocrd_calamari/archive/250a24d.tar.gz + https://github.com/qurator-spk/dinglehopper/archive/0f056b9.tar.gz diff --git a/run b/run index 46d7a33..2864004 100755 --- a/run +++ b/run @@ -1,3 +1,3 @@ #!/bin/sh # XXX Do not run privileged, use udica instead -docker run -it --rm --mount type=bind,src="$(pwd)",target=/data --privileged=true my_ocrd_workflow +docker run --gpus all -it --rm --mount type=bind,src="$(pwd)",target=/data --privileged=true my_ocrd_workflow