mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-06-09 22:59:53 +02:00
✨ Run Calamari OCR
This commit is contained in:
parent
001e62f54a
commit
0bc06c2fad
7 changed files with 29 additions and 1 deletions
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
[submodule "data"]
|
||||||
|
path = data
|
||||||
|
url = git@code.dev.sbb.berlin:qurator/qurator-data.git
|
|
@ -2,6 +2,9 @@ FROM ubuntu:18.04
|
||||||
|
|
||||||
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
|
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
|
||||||
|
|
||||||
|
RUN mkdir /var/lib/calamari-models
|
||||||
|
COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y \
|
apt-get install -y \
|
||||||
git \
|
git \
|
||||||
|
|
8
build
8
build
|
@ -1,2 +1,10 @@
|
||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
|
set -e
|
||||||
|
|
||||||
|
(
|
||||||
|
cd data
|
||||||
|
git annex get calamari-models/GT4HistOCR
|
||||||
|
git annex unlock calamari-models/GT4HistOCR # So they are not symlinks
|
||||||
|
)
|
||||||
|
|
||||||
docker build -t my_ocrd_workflow .
|
docker build -t my_ocrd_workflow .
|
||||||
|
|
1
data
Submodule
1
data
Submodule
|
@ -0,0 +1 @@
|
||||||
|
Subproject commit 9af6a6c6c882b1dfdbf1efba7b1f555638c2ffd5
|
|
@ -81,6 +81,14 @@ do_ocr() {
|
||||||
-p <(echo $ocrd_tesserocr_recognize_parameters)
|
-p <(echo $ocrd_tesserocr_recognize_parameters)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Run Calamari OCR and register the result as file group OCR-D-OCR-CALAMARI.
#
# Globals:   LOG_LEVEL (read) - handed to ocrd-calamari-recognize via -l
#            ocrd_calamari_recognize_parameters (written) - JSON parameters
# Inputs:    mets.xml in the current directory, file group OCR-D-SEG-LINE
# Outputs:   file group OCR-D-OCR-CALAMARI in mets.xml
do_ocr_calamari() {
  # JSON parameter document for the processor. Kept in single quotes so the
  # "*" in the checkpoint path stays a literal character in the JSON text.
  ocrd_calamari_recognize_parameters='{ "checkpoint": "/var/lib/calamari-models/GT4HistOCR/*.ckpt.json" }'

  # NOTE(review): remove_filegrp is defined elsewhere in this script;
  # presumably it drops an earlier OCR-D-OCR-CALAMARI group so the step can
  # be re-run - confirm against its definition.
  remove_filegrp OCR-D-OCR-CALAMARI mets.xml

  # Both expansions are quoted: unquoted, the JSON string would be subject to
  # word splitting and pathname expansion (it contains spaces and a "*")
  # before being written to the parameter file. printf is used instead of
  # echo so the text is emitted verbatim.
  ocrd-calamari-recognize -l "$LOG_LEVEL" \
    -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI \
    -p <(printf '%s\n' "$ocrd_calamari_recognize_parameters")
}
|
||||||
|
|
||||||
page_validate_xml() {
|
page_validate_xml() {
|
||||||
# Validate all PAGE XML against the XML schema
|
# Validate all PAGE XML against the XML schema
|
||||||
|
|
||||||
|
@ -141,6 +149,8 @@ page_validate_xml OCR-D-SEG-REGION
|
||||||
page_validate_xml OCR-D-SEG-LINE
|
page_validate_xml OCR-D-SEG-LINE
|
||||||
do_validate
|
do_validate
|
||||||
|
|
||||||
|
do_ocr_calamari
|
||||||
|
|
||||||
do_ocr
|
do_ocr
|
||||||
page_validate_xml OCR-D-OCR-TESS
|
page_validate_xml OCR-D-OCR-TESS
|
||||||
page_workaround_remove_conf OCR-D-OCR-TESS
|
page_workaround_remove_conf OCR-D-OCR-TESS
|
||||||
|
|
|
@ -8,4 +8,7 @@ https://github.com/mikegerber/ocrd_kraken/archive/fix/pass-down-page-id.tar.gz
|
||||||
tesserocr == 2.3.1 # 2.4.0 fails with Ubuntu 18.04's tesseract
|
tesserocr == 2.3.1 # 2.4.0 fails with Ubuntu 18.04's tesseract
|
||||||
ocrd_tesserocr
|
ocrd_tesserocr
|
||||||
|
|
||||||
|
setuptools >= 41.0.0 # FIXME tensorboard seems to depend on this, but why do we get an error at runtime?
|
||||||
|
https://github.com/mikegerber/ocrd_calamari/archive/250a24d.tar.gz
|
||||||
|
|
||||||
https://github.com/qurator-spk/dinglehopper/archive/0f056b9.tar.gz
|
https://github.com/qurator-spk/dinglehopper/archive/0f056b9.tar.gz
|
||||||
|
|
2
run
2
run
|
@ -1,3 +1,3 @@
|
||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# XXX Do not run privileged, use udica instead
|
# XXX Do not run privileged, use udica instead
|
||||||
docker run -it --rm --mount type=bind,src="$(pwd)",target=/data --privileged=true my_ocrd_workflow
|
docker run --gpus all -it --rm --mount type=bind,src="$(pwd)",target=/data --privileged=true my_ocrd_workflow
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue