diff --git a/Dockerfile-sbb_binarization b/Dockerfile-sbb_binarization new file mode 100644 index 0000000..bd5f70e --- /dev/null +++ b/Dockerfile-sbb_binarization @@ -0,0 +1,22 @@ +FROM my_ocrd_workflow-core + +ARG PIP_INSTALL="pip3 install --no-cache-dir --use-feature=2020-resolver" +ARG SBB_BINARIZATION_COMMIT="3e60a62" + + +# Build pip installable stuff +RUN ${PIP_INSTALL} \ +# Now the real stuff: + https://github.com/qurator-spk/sbb_binarization/archive/$SBB_BINARIZATION_COMMIT.tar.gz + + +# Copy models +COPY data/sbb_binarization /var/lib/sbb_binarization + + +# Check pip dependencies +RUN pip3 check + + +# Default command +CMD ["ocrd-sbb-binarize"] diff --git a/build b/build index 17fdee8..efe5467 100755 --- a/build +++ b/build @@ -10,11 +10,13 @@ get_from_annex() { annex_get 'tesseract-models/GT4HistOCR/*.traineddata' annex_get 'textline_detection/*.h5' annex_get 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' + annex_get 'sbb_binarization/*.h5' } get_from_web() { download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz' 'calamari-models/GT4HistOCR/2019-07-22T15_49+0200' download_to 'https://qurator-data.de/tesseract-models/GT4HistOCR/models.tar' 'tesseract-models/GT4HistOCR' download_to 'https://qurator-data.de/sbb_textline_detector/models.tar.gz' 'textline_detection' + download_to --strip-components 1 'https://qurator-data.de/sbb_binarization/models.tar.gz' 'sbb_binarization' download_to --no-unpack 'https://qurator-data.de/mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' } . $self_dir/qurator_data_lib.sh @@ -27,3 +29,4 @@ docker build --cache-from=my_ocrd_workflow-dinglehopper -t my_ocrd_work docker build --cache-from=my_ocrd_workflow-ocrd_olena -t my_ocrd_workflow-ocrd_olena -f Dockerfile-ocrd_olena . docker build --cache-from=my_ocrd_workflow-ocrd_tesserocr -t my_ocrd_workflow-ocrd_tesserocr -f Dockerfile-ocrd_tesserocr . docker build --cache-from=my_ocrd_workflow-sbb_textline_detector -t my_ocrd_workflow-sbb_textline_detector -f Dockerfile-sbb_textline_detector . +docker build --cache-from=my_ocrd_workflow-sbb_binarization -t my_ocrd_workflow-sbb_binarization -f Dockerfile-sbb_binarization . diff --git a/data b/data index 0cc7846..bd1628e 160000 --- a/data +++ b/data @@ -1 +1 @@ -Subproject commit 0cc78464e74fd295f00ee28a6f605ce0856d3db0 +Subproject commit bd1628e1aefbb0766655ee3726b4b0234d20a4f1 diff --git a/my_ocrd_workflow b/my_ocrd_workflow index 805b044..679f9a3 100755 --- a/my_ocrd_workflow +++ b/my_ocrd_workflow @@ -58,6 +58,7 @@ main() { do_validate + #ocrd-sbb-binarize --overwrite -I $INPUT_FILE_GRP -O OCR-D-IMG-BIN -P model "/var/lib/sbb_binarization" ocrd-olena-binarize --overwrite -I $INPUT_FILE_GRP -O OCR-D-IMG-BIN -P impl "sauvola-ms-split" do_validate diff --git a/qurator_data_lib.sh b/qurator_data_lib.sh index c543dff..a307d14 100644 --- a/qurator_data_lib.sh +++ b/qurator_data_lib.sh @@ -62,13 +62,37 @@ annex_get() { ) } +# Options: +# --no-unpack Do NOT unpack the file +# --strip-components NUMBER (as tar's option) download_to() { unpack=1 - if [[ "$1" = '--no-unpack' ]]; then - unpack=0 - shift + tar_options="" + + _options=$(getopt --long no-unpack,strip-components: -- "" "$@") + if [[ $? != 0 ]]; then + echo "Bad parameters for download_to" >&2 + exit 1 fi - file_pattern="$1" + eval set -- "$_options" + while true; do + case "$1" in + --no-unpack) + unpack=0 + ;; + --strip-components) + shift + components=$1 + tar_options="$tar_options --strip-components $components" + ;; + --) + shift + break + ;; + esac + shift + done + download_source="$1" dest="$2" @@ -79,7 +103,7 @@ download_to() { if [[ $unpack = 1 ]]; then mkdir -p "$dest" # Unpacking relies on tar -a unpacking any tar compression - tar -C "$dest" -af $tmpf -xv + tar -C "$dest" $tar_options -af $tmpf -xv rm -f $tmpf else dest_dir=`dirname "$dest"` diff --git a/run b/run index 01fd03c..5b6c3da 100755 --- a/run +++ b/run @@ -37,6 +37,7 @@ build_alias() { shopt -s expand_aliases # Required for non-interactive shells build_alias ocrd ${DOCKER_IMAGE_PREFIX}-core:${DOCKER_IMAGE_TAG} build_alias ocrd-olena-binarize ${DOCKER_IMAGE_PREFIX}-ocrd_olena:${DOCKER_IMAGE_TAG} +build_alias ocrd-sbb-binarize ${DOCKER_IMAGE_PREFIX}-sbb_binarization:${DOCKER_IMAGE_TAG} build_alias ocrd-sbb-textline-detector ${DOCKER_IMAGE_PREFIX}-sbb_textline_detector:${DOCKER_IMAGE_TAG} build_alias ocrd-calamari-recognize ${DOCKER_IMAGE_PREFIX}-ocrd_calamari:${DOCKER_IMAGE_TAG} build_alias ocrd-tesserocr-recognize ${DOCKER_IMAGE_PREFIX}-ocrd_tesserocr:${DOCKER_IMAGE_TAG}