mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-07-27 13:49:53 +02:00
🚧 Prepare support for ocrd-sbb-binarize
ocrd-sbb-binarize seems to work, but its input does not work with ocrd-sbb-textline-detector; see https://github.com/qurator-spk/sbb_binarization/issues/8 and https://github.com/qurator-spk/sbb_textline_detection/issues/47
This commit is contained in:
parent
053fc0bc34
commit
0841af5491
6 changed files with 57 additions and 6 deletions
22
Dockerfile-sbb_binarization
Normal file
22
Dockerfile-sbb_binarization
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
FROM my_ocrd_workflow-core
|
||||||
|
|
||||||
|
ARG PIP_INSTALL="pip3 install --no-cache-dir --use-feature=2020-resolver"
|
||||||
|
ARG SBB_BINARIZATION_COMMIT="3e60a62"
|
||||||
|
|
||||||
|
|
||||||
|
# Build pip installable stuff
|
||||||
|
RUN ${PIP_INSTALL} \
|
||||||
|
# Now the real stuff:
|
||||||
|
https://github.com/qurator-spk/sbb_binarization/archive/$SBB_BINARIZATION_COMMIT.tar.gz
|
||||||
|
|
||||||
|
|
||||||
|
# Copy models
|
||||||
|
COPY data/sbb_binarization /var/lib/sbb_binarization
|
||||||
|
|
||||||
|
|
||||||
|
# Check pip dependencies
|
||||||
|
RUN pip3 check
|
||||||
|
|
||||||
|
|
||||||
|
# Default command
|
||||||
|
CMD ["ocrd-sbb-binarize"]
|
3
build
3
build
|
@ -10,11 +10,13 @@ get_from_annex() {
|
||||||
annex_get 'tesseract-models/GT4HistOCR/*.traineddata'
|
annex_get 'tesseract-models/GT4HistOCR/*.traineddata'
|
||||||
annex_get 'textline_detection/*.h5'
|
annex_get 'textline_detection/*.h5'
|
||||||
annex_get 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz'
|
annex_get 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz'
|
||||||
|
annex_get 'sbb_binarization/*.h5'
|
||||||
}
|
}
|
||||||
get_from_web() {
|
get_from_web() {
|
||||||
download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz' 'calamari-models/GT4HistOCR/2019-07-22T15_49+0200'
|
download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz' 'calamari-models/GT4HistOCR/2019-07-22T15_49+0200'
|
||||||
download_to 'https://qurator-data.de/tesseract-models/GT4HistOCR/models.tar' 'tesseract-models/GT4HistOCR'
|
download_to 'https://qurator-data.de/tesseract-models/GT4HistOCR/models.tar' 'tesseract-models/GT4HistOCR'
|
||||||
download_to 'https://qurator-data.de/sbb_textline_detector/models.tar.gz' 'textline_detection'
|
download_to 'https://qurator-data.de/sbb_textline_detector/models.tar.gz' 'textline_detection'
|
||||||
|
download_to --strip-components 1 'https://qurator-data.de/sbb_binarization/models.tar.gz' 'sbb_binarization'
|
||||||
download_to --no-unpack 'https://qurator-data.de/mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz'
|
download_to --no-unpack 'https://qurator-data.de/mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz'
|
||||||
}
|
}
|
||||||
. $self_dir/qurator_data_lib.sh
|
. $self_dir/qurator_data_lib.sh
|
||||||
|
@ -27,3 +29,4 @@ docker build --cache-from=my_ocrd_workflow-dinglehopper -t my_ocrd_work
|
||||||
docker build --cache-from=my_ocrd_workflow-ocrd_olena -t my_ocrd_workflow-ocrd_olena -f Dockerfile-ocrd_olena .
|
docker build --cache-from=my_ocrd_workflow-ocrd_olena -t my_ocrd_workflow-ocrd_olena -f Dockerfile-ocrd_olena .
|
||||||
docker build --cache-from=my_ocrd_workflow-ocrd_tesserocr -t my_ocrd_workflow-ocrd_tesserocr -f Dockerfile-ocrd_tesserocr .
|
docker build --cache-from=my_ocrd_workflow-ocrd_tesserocr -t my_ocrd_workflow-ocrd_tesserocr -f Dockerfile-ocrd_tesserocr .
|
||||||
docker build --cache-from=my_ocrd_workflow-sbb_textline_detector -t my_ocrd_workflow-sbb_textline_detector -f Dockerfile-sbb_textline_detector .
|
docker build --cache-from=my_ocrd_workflow-sbb_textline_detector -t my_ocrd_workflow-sbb_textline_detector -f Dockerfile-sbb_textline_detector .
|
||||||
|
docker build --cache-from=my_ocrd_workflow-sbb_binarization -t my_ocrd_workflow-sbb_binarization -f Dockerfile-sbb_binarization .
|
||||||
|
|
2
data
2
data
|
@ -1 +1 @@
|
||||||
Subproject commit 0cc78464e74fd295f00ee28a6f605ce0856d3db0
|
Subproject commit bd1628e1aefbb0766655ee3726b4b0234d20a4f1
|
|
@ -58,6 +58,7 @@ main() {
|
||||||
do_validate
|
do_validate
|
||||||
|
|
||||||
|
|
||||||
|
#ocrd-sbb-binarize --overwrite -I $INPUT_FILE_GRP -O OCR-D-IMG-BIN -P model "/var/lib/sbb_binarization"
|
||||||
ocrd-olena-binarize --overwrite -I $INPUT_FILE_GRP -O OCR-D-IMG-BIN -P impl "sauvola-ms-split"
|
ocrd-olena-binarize --overwrite -I $INPUT_FILE_GRP -O OCR-D-IMG-BIN -P impl "sauvola-ms-split"
|
||||||
do_validate
|
do_validate
|
||||||
|
|
||||||
|
|
|
@ -62,13 +62,37 @@ annex_get() {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Options:
|
||||||
|
# --no-unpack Do NOT unpack the file
|
||||||
|
# --strip-components NUMBER (as tar's option)
|
||||||
download_to() {
|
download_to() {
|
||||||
unpack=1
|
unpack=1
|
||||||
if [[ "$1" = '--no-unpack' ]]; then
|
tar_options=""
|
||||||
unpack=0
|
|
||||||
shift
|
_options=$(getopt --long no-unpack,strip-components: -- "" "$@")
|
||||||
|
if [[ $? != 0 ]]; then
|
||||||
|
echo "Bad parameters for download_to" >&2
|
||||||
|
exit 1
|
||||||
fi
|
fi
|
||||||
file_pattern="$1"
|
eval set -- "$_options"
|
||||||
|
while true; do
|
||||||
|
case "$1" in
|
||||||
|
--no-unpack)
|
||||||
|
unpack=0
|
||||||
|
;;
|
||||||
|
--strip-components)
|
||||||
|
shift
|
||||||
|
components=$1
|
||||||
|
tar_options="$tar_options --strip-components $components"
|
||||||
|
;;
|
||||||
|
--)
|
||||||
|
shift
|
||||||
|
break
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
shift
|
||||||
|
done
|
||||||
|
|
||||||
download_source="$1"
|
download_source="$1"
|
||||||
dest="$2"
|
dest="$2"
|
||||||
|
|
||||||
|
@ -79,7 +103,7 @@ download_to() {
|
||||||
if [[ $unpack = 1 ]]; then
|
if [[ $unpack = 1 ]]; then
|
||||||
mkdir -p "$dest"
|
mkdir -p "$dest"
|
||||||
# Unpacking relies on tar -a unpacking any tar compression
|
# Unpacking relies on tar -a unpacking any tar compression
|
||||||
tar -C "$dest" -af $tmpf -xv
|
tar -C "$dest" $tar_options -af $tmpf -xv
|
||||||
rm -f $tmpf
|
rm -f $tmpf
|
||||||
else
|
else
|
||||||
dest_dir=`dirname "$dest"`
|
dest_dir=`dirname "$dest"`
|
||||||
|
|
1
run
1
run
|
@ -37,6 +37,7 @@ build_alias() {
|
||||||
shopt -s expand_aliases # Required for non-interactive shells
|
shopt -s expand_aliases # Required for non-interactive shells
|
||||||
build_alias ocrd ${DOCKER_IMAGE_PREFIX}-core:${DOCKER_IMAGE_TAG}
|
build_alias ocrd ${DOCKER_IMAGE_PREFIX}-core:${DOCKER_IMAGE_TAG}
|
||||||
build_alias ocrd-olena-binarize ${DOCKER_IMAGE_PREFIX}-ocrd_olena:${DOCKER_IMAGE_TAG}
|
build_alias ocrd-olena-binarize ${DOCKER_IMAGE_PREFIX}-ocrd_olena:${DOCKER_IMAGE_TAG}
|
||||||
|
build_alias ocrd-sbb-binarize ${DOCKER_IMAGE_PREFIX}-sbb_binarization:${DOCKER_IMAGE_TAG}
|
||||||
build_alias ocrd-sbb-textline-detector ${DOCKER_IMAGE_PREFIX}-sbb_textline_detector:${DOCKER_IMAGE_TAG}
|
build_alias ocrd-sbb-textline-detector ${DOCKER_IMAGE_PREFIX}-sbb_textline_detector:${DOCKER_IMAGE_TAG}
|
||||||
build_alias ocrd-calamari-recognize ${DOCKER_IMAGE_PREFIX}-ocrd_calamari:${DOCKER_IMAGE_TAG}
|
build_alias ocrd-calamari-recognize ${DOCKER_IMAGE_PREFIX}-ocrd_calamari:${DOCKER_IMAGE_TAG}
|
||||||
build_alias ocrd-tesserocr-recognize ${DOCKER_IMAGE_PREFIX}-ocrd_tesserocr:${DOCKER_IMAGE_TAG}
|
build_alias ocrd-tesserocr-recognize ${DOCKER_IMAGE_PREFIX}-ocrd_tesserocr:${DOCKER_IMAGE_TAG}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue