Run Calamari OCR

pull/27/head
Gerber, Mike 5 years ago
parent 001e62f54a
commit 0bc06c2fad

3
.gitmodules vendored

@ -0,0 +1,3 @@
[submodule "data"]
path = data
url = git@code.dev.sbb.berlin:qurator/qurator-data.git

@ -2,6 +2,9 @@ FROM ubuntu:18.04
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
RUN mkdir /var/lib/calamari-models
COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y \ apt-get install -y \
git \ git \

@ -1,2 +1,10 @@
#!/bin/sh #!/bin/sh
# Abort the script as soon as any command fails.
set -e
# Run the model download in a subshell so the `cd` stays confined to it.
(
cd data
# Fetch the content of the Calamari model directory through git-annex.
git annex get calamari-models/GT4HistOCR
# NOTE(review): presumably needed because the image build copies this
# directory and must see regular files -- confirm against the Dockerfile.
git annex unlock calamari-models/GT4HistOCR # So they are not symlinks
)
docker build -t my_ocrd_workflow . docker build -t my_ocrd_workflow .

@ -0,0 +1 @@
Subproject commit 9af6a6c6c882b1dfdbf1efba7b1f555638c2ffd5

@ -81,6 +81,14 @@ do_ocr() {
-p <(echo $ocrd_tesserocr_recognize_parameters) -p <(echo $ocrd_tesserocr_recognize_parameters)
} }
#######################################
# Run Calamari OCR on the OCR-D-SEG-LINE file group and write the result
# to the OCR-D-OCR-CALAMARI file group of mets.xml.
# Globals:   LOG_LEVEL (read) - log level handed to the processor via -l
# Arguments: none
# Returns:   exit status of ocrd-calamari-recognize
#######################################
do_ocr_calamari() {
  # JSON parameter document. The checkpoint value is a glob pattern that
  # must reach the processor literally, so it is never expanded unquoted.
  # NOTE(review): presumably ocrd-calamari-recognize expands the pattern
  # itself -- confirm against its documentation.
  local ocrd_calamari_recognize_parameters='{ "checkpoint": "/var/lib/calamari-models/GT4HistOCR/*.ckpt.json" }'

  # Clear the output file group first (presumably so a re-run starts clean).
  remove_filegrp OCR-D-OCR-CALAMARI mets.xml

  # Quote LOG_LEVEL so an empty or whitespace-containing value cannot shift
  # the remaining options; feed the parameters through process substitution
  # with printf so the JSON is passed verbatim (no word splitting/globbing).
  ocrd-calamari-recognize -l "$LOG_LEVEL" \
    -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI \
    -p <(printf '%s\n' "$ocrd_calamari_recognize_parameters")
}
page_validate_xml() { page_validate_xml() {
# Validate all PAGE XML against the XML schema # Validate all PAGE XML against the XML schema
@ -141,6 +149,8 @@ page_validate_xml OCR-D-SEG-REGION
page_validate_xml OCR-D-SEG-LINE page_validate_xml OCR-D-SEG-LINE
do_validate do_validate
do_ocr_calamari
do_ocr do_ocr
page_validate_xml OCR-D-OCR-TESS page_validate_xml OCR-D-OCR-TESS
page_workaround_remove_conf OCR-D-OCR-TESS page_workaround_remove_conf OCR-D-OCR-TESS

@ -8,4 +8,7 @@ https://github.com/mikegerber/ocrd_kraken/archive/fix/pass-down-page-id.tar.gz
tesserocr == 2.3.1 # 2.4.0 fails with Ubuntu 18.04's tesseract tesserocr == 2.3.1 # 2.4.0 fails with Ubuntu 18.04's tesseract
ocrd_tesserocr ocrd_tesserocr
setuptools >= 41.0.0 # FIXME tensorboard seems to depend on this, but why do we get an error at runtime?
https://github.com/mikegerber/ocrd_calamari/archive/250a24d.tar.gz
https://github.com/qurator-spk/dinglehopper/archive/0f056b9.tar.gz https://github.com/qurator-spk/dinglehopper/archive/0f056b9.tar.gz

2
run

@ -1,3 +1,3 @@
#!/bin/sh #!/bin/sh
# XXX Do not run privileged, use udica instead # XXX Do not run privileged, use udica instead
docker run -it --rm --mount type=bind,src="$(pwd)",target=/data --privileged=true my_ocrd_workflow docker run --gpus all -it --rm --mount type=bind,src="$(pwd)",target=/data --privileged=true my_ocrd_workflow

Loading…
Cancel
Save