mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-07-01 01:19:52 +02:00
🚧 Prepare supporting ocrd-sbb-binarize
ocrd-sbb-binarize seems to work, but its input does not work with ocrd-sbb-textline-detector; see https://github.com/qurator-spk/sbb_binarization/issues/8 and https://github.com/qurator-spk/sbb_textline_detection/issues/47
This commit is contained in:
parent
053fc0bc34
commit
0841af5491
6 changed files with 57 additions and 6 deletions
22
Dockerfile-sbb_binarization
Normal file
22
Dockerfile-sbb_binarization
Normal file
|
@ -0,0 +1,22 @@
|
|||
# Image for the ocrd-sbb-binarize processor, layered on the my_ocrd_workflow-core image.
FROM my_ocrd_workflow-core
|
||||
|
||||
# pip command line used by the install step below; --no-cache-dir disables pip's download cache.
# NOTE(review): --use-feature=2020-resolver was an opt-in flag of pip 20.x — confirm the pip
# shipped in the base image still accepts it.
ARG PIP_INSTALL="pip3 install --no-cache-dir --use-feature=2020-resolver"
|
||||
# sbb_binarization revision; substituted into the GitHub archive URL installed below.
ARG SBB_BINARIZATION_COMMIT="3e60a62"
|
||||
|
||||
|
||||
# Build pip installable stuff
|
||||
RUN ${PIP_INSTALL} \
|
||||
# Now the real stuff:
|
||||
https://github.com/qurator-spk/sbb_binarization/archive/$SBB_BINARIZATION_COMMIT.tar.gz
|
||||
|
||||
|
||||
# Copy models from data/sbb_binarization in the build context to /var/lib/sbb_binarization
|
||||
COPY data/sbb_binarization /var/lib/sbb_binarization
|
||||
|
||||
|
||||
# Check pip dependencies (pip3 check reports installed packages with missing or incompatible requirements)
|
||||
RUN pip3 check
|
||||
|
||||
|
||||
# Default command: run the ocrd-sbb-binarize processor
|
||||
CMD ["ocrd-sbb-binarize"]
|
3
build
3
build
|
@ -10,11 +10,13 @@ get_from_annex() {
|
|||
annex_get 'tesseract-models/GT4HistOCR/*.traineddata'
|
||||
annex_get 'textline_detection/*.h5'
|
||||
annex_get 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz'
|
||||
annex_get 'sbb_binarization/*.h5'
|
||||
}
|
||||
# Fetch every model/data archive from qurator-data.de through download_to.
# Each call passes the source URL followed by the destination path; leading
# options are the ones download_to documents (--no-unpack, --strip-components).
# NOTE(review): destination paths look relative to the data directory — confirm
# against download_to's implementation.
get_from_web() {
|
||||
download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz' 'calamari-models/GT4HistOCR/2019-07-22T15_49+0200'
|
||||
download_to 'https://qurator-data.de/tesseract-models/GT4HistOCR/models.tar' 'tesseract-models/GT4HistOCR'
|
||||
download_to 'https://qurator-data.de/sbb_textline_detector/models.tar.gz' 'textline_detection'
|
||||
# --strip-components 1 is handed on to tar when the archive is unpacked
download_to --strip-components 1 'https://qurator-data.de/sbb_binarization/models.tar.gz' 'sbb_binarization'
|
||||
# --no-unpack: keep the downloaded tarball as a file instead of extracting it
download_to --no-unpack 'https://qurator-data.de/mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz'
|
||||
}
|
||||
# Load the shared data helpers from the script's own directory.
# The path is quoted so that a checkout location containing spaces or glob
# characters is passed to `.` as a single, literal file name.
# NOTE(review): presumably this library provides annex_get and download_to — confirm.
. "$self_dir/qurator_data_lib.sh"
|
||||
|
@ -27,3 +29,4 @@ docker build --cache-from=my_ocrd_workflow-dinglehopper -t my_ocrd_work
|
|||
# Build one image per processor from its Dockerfile-<name> in the current directory.
# --cache-from names the same tag that is being built, so an already existing
# image of that name can be used as a layer cache source.
docker build --cache-from=my_ocrd_workflow-ocrd_olena -t my_ocrd_workflow-ocrd_olena -f Dockerfile-ocrd_olena .
|
||||
docker build --cache-from=my_ocrd_workflow-ocrd_tesserocr -t my_ocrd_workflow-ocrd_tesserocr -f Dockerfile-ocrd_tesserocr .
|
||||
docker build --cache-from=my_ocrd_workflow-sbb_textline_detector -t my_ocrd_workflow-sbb_textline_detector -f Dockerfile-sbb_textline_detector .
|
||||
docker build --cache-from=my_ocrd_workflow-sbb_binarization -t my_ocrd_workflow-sbb_binarization -f Dockerfile-sbb_binarization .
|
||||
|
|
2
data
2
data
|
@ -1 +1 @@
|
|||
Subproject commit 0cc78464e74fd295f00ee28a6f605ce0856d3db0
|
||||
Subproject commit bd1628e1aefbb0766655ee3726b4b0234d20a4f1
|
|
@ -58,6 +58,7 @@ main() {
|
|||
do_validate
|
||||
|
||||
|
||||
#ocrd-sbb-binarize --overwrite -I $INPUT_FILE_GRP -O OCR-D-IMG-BIN -P model "/var/lib/sbb_binarization"
|
||||
ocrd-olena-binarize --overwrite -I $INPUT_FILE_GRP -O OCR-D-IMG-BIN -P impl "sauvola-ms-split"
|
||||
do_validate
|
||||
|
||||
|
|
|
@ -62,13 +62,37 @@ annex_get() {
|
|||
)
|
||||
}
|
||||
|
||||
# Options:
|
||||
# --no-unpack Do NOT unpack the file
|
||||
# --strip-components NUMBER (as tar's option)
|
||||
download_to() {
|
||||
unpack=1
|
||||
if [[ "$1" = '--no-unpack' ]]; then
|
||||
unpack=0
|
||||
shift
|
||||
tar_options=""
|
||||
|
||||
_options=$(getopt --long no-unpack,strip-components: -- "" "$@")
|
||||
if [[ $? != 0 ]]; then
|
||||
echo "Bad parameters for download_to" >&2
|
||||
exit 1
|
||||
fi
|
||||
file_pattern="$1"
|
||||
eval set -- "$_options"
|
||||
while true; do
|
||||
case "$1" in
|
||||
--no-unpack)
|
||||
unpack=0
|
||||
;;
|
||||
--strip-components)
|
||||
shift
|
||||
components=$1
|
||||
tar_options="$tar_options --strip-components $components"
|
||||
;;
|
||||
--)
|
||||
shift
|
||||
break
|
||||
;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
download_source="$1"
|
||||
dest="$2"
|
||||
|
||||
|
@ -79,7 +103,7 @@ download_to() {
|
|||
if [[ $unpack = 1 ]]; then
|
||||
mkdir -p "$dest"
|
||||
# Unpacking relies on tar -a unpacking any tar compression
|
||||
tar -C "$dest" -af $tmpf -xv
|
||||
tar -C "$dest" $tar_options -af $tmpf -xv
|
||||
rm -f $tmpf
|
||||
else
|
||||
dest_dir=`dirname "$dest"`
|
||||
|
|
1
run
1
run
|
@ -37,6 +37,7 @@ build_alias() {
|
|||
shopt -s expand_aliases # Required for non-interactive shells, which do not expand aliases by default
|
||||
# One build_alias call per OCR-D command: first argument is the command name,
# second is the Docker image reference (<prefix>-<tool>:<tag>).
# NOTE(review): build_alias is defined outside this view — presumably it creates
# an alias that runs the command inside the given image; confirm.
build_alias ocrd ${DOCKER_IMAGE_PREFIX}-core:${DOCKER_IMAGE_TAG}
|
||||
build_alias ocrd-olena-binarize ${DOCKER_IMAGE_PREFIX}-ocrd_olena:${DOCKER_IMAGE_TAG}
|
||||
build_alias ocrd-sbb-binarize ${DOCKER_IMAGE_PREFIX}-sbb_binarization:${DOCKER_IMAGE_TAG}
|
||||
build_alias ocrd-sbb-textline-detector ${DOCKER_IMAGE_PREFIX}-sbb_textline_detector:${DOCKER_IMAGE_TAG}
|
||||
build_alias ocrd-calamari-recognize ${DOCKER_IMAGE_PREFIX}-ocrd_calamari:${DOCKER_IMAGE_TAG}
|
||||
build_alias ocrd-tesserocr-recognize ${DOCKER_IMAGE_PREFIX}-ocrd_tesserocr:${DOCKER_IMAGE_TAG}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue