🚧 Prepare support for ocrd-sbb-binarize

ocrd-sbb-binarize seems to work, but its output does not work as input for
ocrd-sbb-textline-detector:

https://github.com/qurator-spk/sbb_binarization/issues/8
https://github.com/qurator-spk/sbb_textline_detection/issues/47
pull/38/head
Gerber, Mike 4 years ago
parent 053fc0bc34
commit 0841af5491

@ -0,0 +1,22 @@
# Image that provides the ocrd-sbb-binarize command, built on top of the
# my_ocrd_workflow-core image.
FROM my_ocrd_workflow-core
# pip command line used by the install step below (a build argument, so it can
# be overridden with --build-arg).
ARG PIP_INSTALL="pip3 install --no-cache-dir --use-feature=2020-resolver"
# Commit of qurator-spk/sbb_binarization whose GitHub archive tarball is installed below.
ARG SBB_BINARIZATION_COMMIT="3e60a62"
# Build pip installable stuff
RUN ${PIP_INSTALL} \
# Now the real stuff:
https://github.com/qurator-spk/sbb_binarization/archive/$SBB_BINARIZATION_COMMIT.tar.gz
# Copy the models from the build context (data/sbb_binarization) into the image
COPY data/sbb_binarization /var/lib/sbb_binarization
# Check pip dependencies: fails the build if installed packages have
# missing or incompatible requirements
RUN pip3 check
# Default command
CMD ["ocrd-sbb-binarize"]

@ -10,11 +10,13 @@ get_from_annex() {
annex_get 'tesseract-models/GT4HistOCR/*.traineddata'
annex_get 'textline_detection/*.h5'
annex_get 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz'
annex_get 'sbb_binarization/*.h5'
}
# Fetch all model data over HTTPS from qurator-data.de.
# Each call passes a source URL and a destination path to download_to, which
# unpacks the downloaded archive into the destination unless --no-unpack is given.
get_from_web() {
download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz' 'calamari-models/GT4HistOCR/2019-07-22T15_49+0200'
download_to 'https://qurator-data.de/tesseract-models/GT4HistOCR/models.tar' 'tesseract-models/GT4HistOCR'
download_to 'https://qurator-data.de/sbb_textline_detector/models.tar.gz' 'textline_detection'
# --strip-components 1 is handed on to tar (drops the leading path component
# of every archive member on extraction).
download_to --strip-components 1 'https://qurator-data.de/sbb_binarization/models.tar.gz' 'sbb_binarization'
# --no-unpack: this tarball is stored as-is, so the destination is a file path.
download_to --no-unpack 'https://qurator-data.de/mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz'
}
. $self_dir/qurator_data_lib.sh
@ -27,3 +29,4 @@ docker build --cache-from=my_ocrd_workflow-dinglehopper -t my_ocrd_work
# Build one image per processor from its own Dockerfile-<name>. Each build
# names the image it is about to tag as --cache-from, so an existing image of
# the same name can serve as a layer cache.
docker build --cache-from=my_ocrd_workflow-ocrd_olena -t my_ocrd_workflow-ocrd_olena -f Dockerfile-ocrd_olena .
docker build --cache-from=my_ocrd_workflow-ocrd_tesserocr -t my_ocrd_workflow-ocrd_tesserocr -f Dockerfile-ocrd_tesserocr .
docker build --cache-from=my_ocrd_workflow-sbb_textline_detector -t my_ocrd_workflow-sbb_textline_detector -f Dockerfile-sbb_textline_detector .
docker build --cache-from=my_ocrd_workflow-sbb_binarization -t my_ocrd_workflow-sbb_binarization -f Dockerfile-sbb_binarization .

@ -1 +1 @@
Subproject commit 0cc78464e74fd295f00ee28a6f605ce0856d3db0
Subproject commit bd1628e1aefbb0766655ee3726b4b0234d20a4f1

@ -58,6 +58,7 @@ main() {
do_validate
#ocrd-sbb-binarize --overwrite -I $INPUT_FILE_GRP -O OCR-D-IMG-BIN -P model "/var/lib/sbb_binarization"
ocrd-olena-binarize --overwrite -I $INPUT_FILE_GRP -O OCR-D-IMG-BIN -P impl "sauvola-ms-split"
do_validate

@ -62,13 +62,37 @@ annex_get() {
)
}
# Options:
# --no-unpack Do NOT unpack the file
# --strip-components NUMBER (as tar's option)
download_to() {
unpack=1
if [[ "$1" = '--no-unpack' ]]; then
tar_options=""
_options=$(getopt --long no-unpack,strip-components: -- "" "$@")
if [[ $? != 0 ]]; then
echo "Bad parameters for download_to" >&2
exit 1
fi
eval set -- "$_options"
while true; do
case "$1" in
--no-unpack)
unpack=0
;;
--strip-components)
shift
fi
file_pattern="$1"
components=$1
tar_options="$tar_options --strip-components $components"
;;
--)
shift
break
;;
esac
shift
done
download_source="$1"
dest="$2"
@ -79,7 +103,7 @@ download_to() {
if [[ $unpack = 1 ]]; then
mkdir -p "$dest"
# Unpacking relies on tar -a unpacking any tar compression
tar -C "$dest" -af $tmpf -xv
tar -C "$dest" $tar_options -af $tmpf -xv
rm -f $tmpf
else
dest_dir=`dirname "$dest"`

1
run

@ -37,6 +37,7 @@ build_alias() {
shopt -s expand_aliases # Required for non-interactive shells
# Register one entry per OCR-D command, pairing the command name with the
# image (${DOCKER_IMAGE_PREFIX}-<name>:${DOCKER_IMAGE_TAG}) that provides it.
# NOTE(review): build_alias is defined outside this excerpt; presumably it
# creates a shell alias that runs the command in that image — confirm.
build_alias ocrd ${DOCKER_IMAGE_PREFIX}-core:${DOCKER_IMAGE_TAG}
build_alias ocrd-olena-binarize ${DOCKER_IMAGE_PREFIX}-ocrd_olena:${DOCKER_IMAGE_TAG}
build_alias ocrd-sbb-binarize ${DOCKER_IMAGE_PREFIX}-sbb_binarization:${DOCKER_IMAGE_TAG}
build_alias ocrd-sbb-textline-detector ${DOCKER_IMAGE_PREFIX}-sbb_textline_detector:${DOCKER_IMAGE_TAG}
build_alias ocrd-calamari-recognize ${DOCKER_IMAGE_PREFIX}-ocrd_calamari:${DOCKER_IMAGE_TAG}
build_alias ocrd-tesserocr-recognize ${DOCKER_IMAGE_PREFIX}-ocrd_tesserocr:${DOCKER_IMAGE_TAG}

Loading…
Cancel
Save