🚧 Prepare support for ocrd-sbb-binarize

ocrd-sbb-binarize seems to work, but its output does not work as input for ocrd-sbb-textline-detector: https://github.com/qurator-spk/sbb_binarization/issues/8 https://github.com/qurator-spk/sbb_textline_detection/issues/47
2025-07-27 13:49:53 +02:00 · 2020-10-22 21:08:13 +02:00 · 2020-10-22 21:08:13 +02:00 · 0841af5491
commit 0841af5491
parent 053fc0bc34
6 changed files with 57 additions and 6 deletions
--- a/22
+++ b/22
@ -0,0 +1,22 @@
+FROM my_ocrd_workflow-core
+
+ARG PIP_INSTALL="pip3 install --no-cache-dir --use-feature=2020-resolver"
+ARG SBB_BINARIZATION_COMMIT="3e60a62"
+
+
+# Build pip installable stuff
+RUN ${PIP_INSTALL} \
+# Now the real stuff:
+    https://github.com/qurator-spk/sbb_binarization/archive/$SBB_BINARIZATION_COMMIT.tar.gz
+
+
+# Copy models
+COPY data/sbb_binarization /var/lib/sbb_binarization
+
+
+# Check pip dependencies
+RUN pip3 check
+
+
+# Default command
+CMD ["ocrd-sbb-binarize"]
--- a/3
+++ b/3
@ -10,11 +10,13 @@ get_from_annex() {
  annex_get 'tesseract-models/GT4HistOCR/*.traineddata'
  annex_get 'textline_detection/*.h5'
  annex_get 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz'
+  annex_get 'sbb_binarization/*.h5'
 }
 get_from_web() {
  download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz' 'calamari-models/GT4HistOCR/2019-07-22T15_49+0200'
  download_to 'https://qurator-data.de/tesseract-models/GT4HistOCR/models.tar'  'tesseract-models/GT4HistOCR'
  download_to 'https://qurator-data.de/sbb_textline_detector/models.tar.gz'     'textline_detection'
+  download_to --strip-components 1 'https://qurator-data.de/sbb_binarization/models.tar.gz' 'sbb_binarization'
  download_to --no-unpack 'https://qurator-data.de/mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz'
 }
 . $self_dir/qurator_data_lib.sh
@ -27,3 +29,4 @@ docker build --cache-from=my_ocrd_workflow-dinglehopper          -t my_ocrd_work
 docker build --cache-from=my_ocrd_workflow-ocrd_olena            -t my_ocrd_workflow-ocrd_olena            -f Dockerfile-ocrd_olena            .
 docker build --cache-from=my_ocrd_workflow-ocrd_tesserocr        -t my_ocrd_workflow-ocrd_tesserocr        -f Dockerfile-ocrd_tesserocr        .
 docker build --cache-from=my_ocrd_workflow-sbb_textline_detector -t my_ocrd_workflow-sbb_textline_detector -f Dockerfile-sbb_textline_detector .
+docker build --cache-from=my_ocrd_workflow-sbb_binarization      -t my_ocrd_workflow-sbb_binarization      -f Dockerfile-sbb_binarization      .
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 0cc78464e74fd295f00ee28a6f605ce0856d3db0
+Subproject commit bd1628e1aefbb0766655ee3726b4b0234d20a4f1
--- a/1
+++ b/1
@ -58,6 +58,7 @@ main() {
  do_validate


+  #ocrd-sbb-binarize --overwrite -I $INPUT_FILE_GRP -O OCR-D-IMG-BIN -P model "/var/lib/sbb_binarization"
  ocrd-olena-binarize --overwrite -I $INPUT_FILE_GRP -O OCR-D-IMG-BIN -P impl "sauvola-ms-split"
  do_validate

--- a/qurator_data_lib.sh
+++ b/qurator_data_lib.sh
@ -62,13 +62,37 @@ annex_get() {
  )
 }

+# Options:
+# --no-unpack                Do NOT unpack the file
+# --strip-components NUMBER  Passed to tar as --strip-components NUMBER when unpacking
 download_to() {
  unpack=1
-  if [[ "$1" = '--no-unpack' ]]; then
-    unpack=0
-    shift
+  tar_options=""
+
+  _options=$(getopt --long no-unpack,strip-components: -- "" "$@")
+  if [[ $? != 0 ]]; then
+    echo "Bad parameters for download_to" >&2
+    exit 1
  fi
-  file_pattern="$1"
+  eval set -- "$_options"
+  while true; do
+    case "$1" in
+    --no-unpack)
+      unpack=0
+      ;;
+    --strip-components)
+      shift
+      components=$1
+      tar_options="$tar_options --strip-components $components"
+      ;;
+    --)
+      shift
+      break
+      ;;
+    esac
+    shift
+  done
+
  download_source="$1"
  dest="$2"

@ -79,7 +103,7 @@ download_to() {
    if [[ $unpack = 1 ]]; then
      mkdir -p "$dest"
      # Unpacking relies on tar -a unpacking any tar compression
-      tar -C "$dest" -af $tmpf -xv
+      tar -C "$dest" $tar_options -af $tmpf -xv
      rm -f $tmpf
    else
      dest_dir=`dirname "$dest"`
--- a/1
+++ b/1
@ -37,6 +37,7 @@ build_alias() {
 shopt -s expand_aliases  # Required for non-interactive shells
 build_alias ocrd                       ${DOCKER_IMAGE_PREFIX}-core:${DOCKER_IMAGE_TAG}
 build_alias ocrd-olena-binarize        ${DOCKER_IMAGE_PREFIX}-ocrd_olena:${DOCKER_IMAGE_TAG}
+build_alias ocrd-sbb-binarize          ${DOCKER_IMAGE_PREFIX}-sbb_binarization:${DOCKER_IMAGE_TAG}
 build_alias ocrd-sbb-textline-detector ${DOCKER_IMAGE_PREFIX}-sbb_textline_detector:${DOCKER_IMAGE_TAG}
 build_alias ocrd-calamari-recognize    ${DOCKER_IMAGE_PREFIX}-ocrd_calamari:${DOCKER_IMAGE_TAG}
 build_alias ocrd-tesserocr-recognize   ${DOCKER_IMAGE_PREFIX}-ocrd_tesserocr:${DOCKER_IMAGE_TAG}