mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-06-09 14:49:53 +02:00
✨ Run Calamari OCR
This commit is contained in:
parent
001e62f54a
commit
0bc06c2fad
7 changed files with 29 additions and 1 deletions
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
[submodule "data"]
|
||||
path = data
|
||||
url = git@code.dev.sbb.berlin:qurator/qurator-data.git
|
|
@ -2,6 +2,9 @@ FROM ubuntu:18.04
|
|||
|
||||
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
|
||||
|
||||
RUN mkdir /var/lib/calamari-models
|
||||
COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
git \
|
||||
|
|
8
build
8
build
|
@ -1,2 +1,10 @@
|
|||
#!/bin/sh
# Build the my_ocrd_workflow Docker image.
#
# Before building, the Calamari GT4HistOCR model files are fetched inside the
# "data" checkout with git-annex, because the image build copies them from
# data/calamari-models/GT4HistOCR.
set -e

model_dir=calamari-models/GT4HistOCR

# The subshell keeps the directory change local to the git-annex steps, so
# docker build below still runs from the original working directory.
(
  cd data &&
    git annex get "$model_dir" &&
    # Unlock so the model files are not symlinks.
    git annex unlock "$model_dir"
)

docker build -t my_ocrd_workflow .
|
||||
|
|
1
data
Submodule
1
data
Submodule
|
@ -0,0 +1 @@
|
|||
Subproject commit 9af6a6c6c882b1dfdbf1efba7b1f555638c2ffd5
|
|
@ -81,6 +81,14 @@ do_ocr() {
|
|||
-p <(echo $ocrd_tesserocr_recognize_parameters)
|
||||
}
|
||||
|
||||
#######################################
# Run Calamari OCR on the segmented text lines.
#
# Calls remove_filegrp for OCR-D-OCR-CALAMARI on mets.xml first (presumably
# to clear the output of a previous run -- confirm against remove_filegrp),
# then runs ocrd-calamari-recognize with OCR-D-SEG-LINE as input file group
# and OCR-D-OCR-CALAMARI as output file group.
#
# Globals:
#   LOG_LEVEL                           (read) passed to the processor via -l
#   ocrd_calamari_recognize_parameters  (written) JSON parameter document
# Arguments:
#   None
#######################################
do_ocr_calamari() {
  # The "*" in the checkpoint path must reach the processor literally;
  # NOTE(review): presumably ocrd_calamari expands the pattern itself -- confirm.
  ocrd_calamari_recognize_parameters='{ "checkpoint": "/var/lib/calamari-models/GT4HistOCR/*.ckpt.json" }'

  remove_filegrp OCR-D-OCR-CALAMARI mets.xml

  # Both expansions are quoted so that neither word splitting nor pathname
  # expansion can alter the log level or the JSON handed to the processor.
  ocrd-calamari-recognize -l "$LOG_LEVEL" \
    -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI \
    -p <(printf '%s\n' "$ocrd_calamari_recognize_parameters")
}
|
||||
|
||||
page_validate_xml() {
|
||||
# Validate all PAGE XML against the XML schema
|
||||
|
||||
|
@ -141,6 +149,8 @@ page_validate_xml OCR-D-SEG-REGION
|
|||
page_validate_xml OCR-D-SEG-LINE
|
||||
do_validate
|
||||
|
||||
do_ocr_calamari
|
||||
|
||||
do_ocr
|
||||
page_validate_xml OCR-D-OCR-TESS
|
||||
page_workaround_remove_conf OCR-D-OCR-TESS
|
||||
|
|
|
@ -8,4 +8,7 @@ https://github.com/mikegerber/ocrd_kraken/archive/fix/pass-down-page-id.tar.gz
|
|||
tesserocr == 2.3.1 # 2.4.0 fails with Ubuntu 18.04's tesseract
|
||||
ocrd_tesserocr
|
||||
|
||||
setuptools >= 41.0.0 # FIXME tensorboard seems to depend on this, but why do we get an error at runtime?
|
||||
https://github.com/mikegerber/ocrd_calamari/archive/250a24d.tar.gz
|
||||
|
||||
https://github.com/qurator-spk/dinglehopper/archive/0f056b9.tar.gz
|
||||
|
|
2
run
2
run
|
@ -1,3 +1,3 @@
|
|||
#!/bin/sh
|
||||
# XXX Do not run privileged, use udica instead
|
||||
docker run -it --rm --mount type=bind,src="$(pwd)",target=/data --privileged=true my_ocrd_workflow
|
||||
docker run --gpus all -it --rm --mount type=bind,src="$(pwd)",target=/data --privileged=true my_ocrd_workflow
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue