From 699023c0843cd38dcc48f043c599242c594b3f28 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 25 Apr 2024 16:21:25 +0200 Subject: [PATCH 01/13] =?UTF-8?q?=F0=9F=9A=A7=20WIP:=20Migrate=20to=20usin?= =?UTF-8?q?g=20ocrd:all=20image=20+=20Update=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile-core | 70 ----------------------- Dockerfile-dinglehopper | 18 ------ Dockerfile-eynollah | 18 ------ Dockerfile-ocrd_fileformat | 24 -------- Dockerfile-ocrd_olena | 39 ------------- Dockerfile-ocrd_segment | 19 ------ Dockerfile-ocrd_tesserocr | 31 ---------- Dockerfile-ocrd_wrap | 18 ------ Dockerfile-sbb_textline_detector | 20 ------- build | 33 ----------- test-ocrd_tesserocr.sh | 2 + test-sbb_binarization.sh | 4 +- test-sbb_textline_detector.sh | 16 ------ wrapper/qurator/ocrd_galley/cli.py | 24 ++------ wrapper/qurator/ocrd_galley/sub_images.py | 1 + 15 files changed, 10 insertions(+), 327 deletions(-) delete mode 100644 Dockerfile-core delete mode 100644 Dockerfile-dinglehopper delete mode 100644 Dockerfile-eynollah delete mode 100644 Dockerfile-ocrd_fileformat delete mode 100644 Dockerfile-ocrd_olena delete mode 100644 Dockerfile-ocrd_segment delete mode 100644 Dockerfile-ocrd_tesserocr delete mode 100644 Dockerfile-ocrd_wrap delete mode 100644 Dockerfile-sbb_textline_detector delete mode 100755 build delete mode 100755 test-sbb_textline_detector.sh diff --git a/Dockerfile-core b/Dockerfile-core deleted file mode 100644 index c655d56..0000000 --- a/Dockerfile-core +++ /dev/null @@ -1,70 +0,0 @@ -FROM ubuntu:22.04 - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_VERSION_MINIMUM="2.47.0" -ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 -ENV PIP_DEFAULT_TIMEOUT=120 - - -RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ - apt-get update && \ - apt-get install -y \ - build-essential \ - curl \ - git \ - xz-utils \ - pkg-config \ -# For add-apt-repository: - 
software-properties-common \ -# XML utils - libxml2-utils \ - xmlstarlet \ -# OCR-D uses ImageMagick for pixel density estimation - imagemagick \ -# pyenv builds -# TODO: builder container? - libz-dev \ - libssl-dev \ - libbz2-dev \ - liblzma-dev \ - libncurses-dev \ - libffi-dev \ - libreadline-dev \ - libsqlite3-dev \ - libmagic-dev \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - - -# Set up OCR-D logging -RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py - - -# Install pyenv -# TODO: do not run as root -# TODO: does just saying "3.7" work as intended? -ENV HOME=/root -ENV PYENV_ROOT=/usr/local/share/pyenv -ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH -RUN \ - git clone --depth=1 https://github.com/yyuu/pyenv.git $PYENV_ROOT && \ - pyenv install 3.7 && \ - pyenv global 3.7 && \ - pyenv rehash && \ - pip install -U pip wheel && \ - pip install setuptools - -# Install pip installable-stuff -RUN ${PIP_INSTALL} \ - "ocrd >= ${OCRD_VERSION_MINIMUM}" - - -# Check pip dependencies -RUN pip check - - -WORKDIR /data - -# Default command -CMD ['ocrd'] diff --git a/Dockerfile-dinglehopper b/Dockerfile-dinglehopper deleted file mode 100644 index 765a1f2..0000000 --- a/Dockerfile-dinglehopper +++ /dev/null @@ -1,18 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG DINGLEHOPPER_VERSION="0.9.2" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "dinglehopper == $DINGLEHOPPER_VERSION" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-dinglehopper"] diff --git a/Dockerfile-eynollah b/Dockerfile-eynollah deleted file mode 100644 index 6505174..0000000 --- a/Dockerfile-eynollah +++ /dev/null @@ -1,18 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG EYNOLLAH_VERSION="0.3.0" - - -# Build pip installable stuff -RUN 
${PIP_INSTALL} \ - "eynollah == ${EYNOLLAH_VERSION}" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-eynollah-segment"] diff --git a/Dockerfile-ocrd_fileformat b/Dockerfile-ocrd_fileformat deleted file mode 100644 index 060f79c..0000000 --- a/Dockerfile-ocrd_fileformat +++ /dev/null @@ -1,24 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_FILEFORMAT_VERSION="0.5.0" - - -RUN apt-get update && \ - apt-get install -y \ - git \ - openjdk-11-jdk-headless \ - wget \ - unzip \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* -RUN git clone --depth 1 --branch v${OCRD_FILEFORMAT_VERSION} https://github.com/OCR-D/ocrd_fileformat.git && \ - cd ocrd_fileformat/ && \ - git submodule update --init && \ - make install-fileformat install PREFIX=/usr/local && \ - cd .. && rm -rf ocrd_fileformat/ - - -# Default command -CMD ['ocrd-fileformat-transform'] diff --git a/Dockerfile-ocrd_olena b/Dockerfile-ocrd_olena deleted file mode 100644 index 29be067..0000000 --- a/Dockerfile-ocrd_olena +++ /dev/null @@ -1,39 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_OLENA_VERSION="1.3.0" - - -# Build ocrd_olena -RUN apt-get update && \ - apt-get install -y \ - imagemagick \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - -# Install olena from .deb -RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1.0+ocrd-git+2-ubuntu22.04/olena-bin_2.1.0+ocrd-git+2_amd64.deb && \ - dpkg -i --force-depends olena-bin_2.1.0+ocrd-git+2_amd64.deb && \ - rm -f olena-bin_2.1.0+ocrd-git+2_amd64.deb && \ - apt-get update && \ - apt-get -f install -y && \ - apt-get clean && rm -rf /var/lib/apt/lists/* && \ - if ! 
scribo-cli sauvola --help >/dev/null 2>&1; then echo "Olena/scribo is not installed correctly" >&2; exit 1; fi -RUN curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ - mkdir ocrd_olena && \ - tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \ - cd ocrd_olena && \ - sed -i 's/^install: deps/install:/' Makefile && \ - ${PIP_INSTALL} ocrd && \ - apt install xmlstarlet && \ - make install PREFIX=/usr/local && \ - cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ['ocrd-olena-binarize'] diff --git a/Dockerfile-ocrd_segment b/Dockerfile-ocrd_segment deleted file mode 100644 index 284f45d..0000000 --- a/Dockerfile-ocrd_segment +++ /dev/null @@ -1,19 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_SEGMENT_VERSION="0.1.22" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ -# Now the real stuff: - "ocrd-segment == ${OCRD_SEGMENT_VERSION}" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-segment-extract-regions"] diff --git a/Dockerfile-ocrd_tesserocr b/Dockerfile-ocrd_tesserocr deleted file mode 100644 index c046cfc..0000000 --- a/Dockerfile-ocrd_tesserocr +++ /dev/null @@ -1,31 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG TESSDATA_BEST_VERSION="4.0.0" -ARG OCRD_TESSEROCR_VERSION="0.17.0" -ENV TESSDATA_PREFIX /usr/local/share/tessdata - - -# Install Leptonica and Tesseract. -# TODO: Review if alex-p's repo is still necessary on jammy (jammy has 4.1.1, -# alex-p has 4.1.3, but not for jammy.) 
-# RUN add-apt-repository ppa:alex-p/tesseract-ocr && \ -RUN apt-get update && \ - apt-get install -y \ - tesseract-ocr \ - libtesseract-dev \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "ocrd_tesserocr == ${OCRD_TESSEROCR_VERSION}" - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-tesserocr-recognize"] diff --git a/Dockerfile-ocrd_wrap b/Dockerfile-ocrd_wrap deleted file mode 100644 index 518d306..0000000 --- a/Dockerfile-ocrd_wrap +++ /dev/null @@ -1,18 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_WRAP_VERSION="0.1.7" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "ocrd_wrap == ${OCRD_WRAP_VERSION}" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-preprocess-image"] diff --git a/Dockerfile-sbb_textline_detector b/Dockerfile-sbb_textline_detector deleted file mode 100644 index 0569ab8..0000000 --- a/Dockerfile-sbb_textline_detector +++ /dev/null @@ -1,20 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG SBB_TEXTLINE_DETECTOR_COMMIT="c4df3d6" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - # https://github.com/qurator-spk/sbb_textline_detection/issues/50 - "h5py < 3" \ - https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-sbb-textline-detector"] diff --git a/build b/build deleted file mode 100755 index 96835cb..0000000 --- a/build +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -set -e - -self=`realpath $0` -self_dir=`dirname "$self"` - - - -if [ -n "$1" ]; then - sub_images="" - for arg in "$@"; do - arg_sub_image=`echo "$arg" | sed 's/Dockerfile-//'` - NL=$'\n' - sub_images+="$NL$arg_sub_image" - done -else - 
sub_images=`ls -1 Dockerfile-core* | sed 's/Dockerfile-//'` - sub_images="$sub_images `ls -1 Dockerfile-* | sed 's/Dockerfile-//'`" -fi -echo "Building:" -echo "$sub_images" -echo - - -# Update base images if we build a core image -if echo "$sub_images" | grep -q core; then - docker pull ubuntu:22.04 - docker pull nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04 -fi - -for sub_image in $sub_images; do - docker build --cache-from=quratorspk/ocrd-galley-$sub_image -t quratorspk/ocrd-galley-$sub_image -f Dockerfile-$sub_image . -done diff --git a/test-ocrd_tesserocr.sh b/test-ocrd_tesserocr.sh index 3cd403c..007698d 100755 --- a/test-ocrd_tesserocr.sh +++ b/test-ocrd_tesserocr.sh @@ -5,6 +5,8 @@ test_id=`basename $0` cd `mktemp -d /tmp/$test_id-XXXXX` # Prepare processors +ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata +ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata ocrd resmgr download ocrd-tesserocr-recognize Fraktur_GT4HistOCR.traineddata # Prepare test workspace diff --git a/test-sbb_binarization.sh b/test-sbb_binarization.sh index cc0afa9..4ecfbbf 100755 --- a/test-sbb_binarization.sh +++ b/test-sbb_binarization.sh @@ -5,7 +5,7 @@ test_id=`basename $0` cd `mktemp -d /tmp/$test_id-XXXXX` # Prepare processors -ocrd resmgr download ocrd-sbb-binarize default-2021-03-09 +ocrd resmgr download ocrd-sbb-binarize default # Prepare test workspace wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip @@ -13,4 +13,4 @@ unzip actevedef_718448162.first-page+binarization+segmentation.zip cd actevedef_718448162.first-page+binarization+segmentation # Run tests -ocrd-sbb-binarize -P model default-2021-03-09 -I OCR-D-IMG -O TEST-OCRD-SBB-BINARIZE +ocrd-sbb-binarize -P model default -I OCR-D-IMG -O TEST-OCRD-SBB-BINARIZE diff --git a/test-sbb_textline_detector.sh b/test-sbb_textline_detector.sh deleted file mode 100755 index 9f940f1..0000000 --- a/test-sbb_textline_detector.sh +++ /dev/null @@ -1,16 +0,0 @@ 
-#!/bin/sh -set -ex - -test_id=`basename $0` -cd `mktemp -d /tmp/$test_id-XXXXX` - -# Prepare processors -ocrd resmgr download ocrd-sbb-textline-detector default - -# Prepare test workspace -wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip -unzip actevedef_718448162.first-page+binarization+segmentation.zip -cd actevedef_718448162.first-page+binarization+segmentation - -# Run tests -ocrd-sbb-textline-detector -P models default -I OCR-D-IMG-BIN -O TEST-EYNOLLAH-SEG diff --git a/wrapper/qurator/ocrd_galley/cli.py b/wrapper/qurator/ocrd_galley/cli.py index 9423c61..0263f42 100644 --- a/wrapper/qurator/ocrd_galley/cli.py +++ b/wrapper/qurator/ocrd_galley/cli.py @@ -5,10 +5,8 @@ import colorama from pathlib import Path from termcolor import colored -from .sub_images import sub_images -DOCKER_IMAGE_PREFIX = os.environ.get("DOCKER_IMAGE_PREFIX", "quratorspk/ocrd-galley") -DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "latest") +DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "maximum") # TODO rename LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") # xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler @@ -26,22 +24,9 @@ def main(): argv = sys.argv.copy() argv[0] = os.path.basename(argv[0]) - # If we're running ocrd resmgr download we need to run the correct subimage. 
- if argv[:3] == ["ocrd", "resmgr", "download"] or \ - argv[:3] == ["ocrd", "resmgr", "list-available"]: - # Default to the base image - sub_image = sub_images[argv[0]] - # But look for a match of the executable - for x in argv[3:]: - if x in sub_images: - sub_image = sub_images[x] - break - else: - sub_image = sub_images[argv[0]] - - docker_image = "%s-%s:%s" % (DOCKER_IMAGE_PREFIX, sub_image, DOCKER_IMAGE_TAG) - - if DOCKER_IMAGE_TAG != "latest": + docker_image = "ocrd/all:%s" % (DOCKER_IMAGE_TAG, ) + + if DOCKER_IMAGE_TAG != "maximum": print(colored(f"Using {docker_image}", 'red')) docker_run(argv, docker_image) @@ -50,6 +35,7 @@ def docker_run(argv, docker_image): docker_run_options = [] docker_run_options.extend(["--rm", "-t"]) docker_run_options.extend(["--mount", "type=bind,src=%s,target=/data" % os.getcwd()]) + docker_run_options.extend(["--mount", "type=tmpfs,target=/tmp"]) docker_run_options.extend(["--user", "%s:%s" % (os.getuid(), os.getgid())]) docker_run_options.extend(["-e", "LOG_LEVEL=%s" % LOG_LEVEL]) docker_run_options.extend(["-e", "_OCRD_COMPLETE"]) diff --git a/wrapper/qurator/ocrd_galley/sub_images.py b/wrapper/qurator/ocrd_galley/sub_images.py index aaea945..f2f4ae1 100644 --- a/wrapper/qurator/ocrd_galley/sub_images.py +++ b/wrapper/qurator/ocrd_galley/sub_images.py @@ -1,3 +1,4 @@ +# TODO is a list now, basically (no more sub images) sub_images = { "ocrd": "core", "ocrd-olena-binarize": "ocrd_olena", From ed96a49321b0e8ebcec4264c705b102c61d592cc Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 25 Apr 2024 17:24:08 +0200 Subject: [PATCH 02/13] =?UTF-8?q?=F0=9F=9A=A7=20WIP:=20Migrate=20to=20usin?= =?UTF-8?q?g=20ocrd:all=20image=20-=20ocrd=5Fanybaseocr?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile-ocrd_anybaseocr | 19 ------------------- test-ocrd_anybaseocr.sh | 16 ++++++++++++++++ wrapper/qurator/ocrd_galley/sub_images.py | 1 + 3 files changed, 17 insertions(+), 19 
deletions(-) delete mode 100644 Dockerfile-ocrd_anybaseocr create mode 100755 test-ocrd_anybaseocr.sh diff --git a/Dockerfile-ocrd_anybaseocr b/Dockerfile-ocrd_anybaseocr deleted file mode 100644 index 6ce5d0e..0000000 --- a/Dockerfile-ocrd_anybaseocr +++ /dev/null @@ -1,19 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_ANYBASEOCR_VERSION="1.8.2" - - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "ocrd_anybaseocr == ${OCRD_ANYBASEOCR_VERSION}" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-anybaseocr-crop"] diff --git a/test-ocrd_anybaseocr.sh b/test-ocrd_anybaseocr.sh new file mode 100755 index 0000000..6d44615 --- /dev/null +++ b/test-ocrd_anybaseocr.sh @@ -0,0 +1,16 @@ +#!/bin/sh +set -ex + +test_id=`basename $0` +cd `mktemp -d /tmp/$test_id-XXXXX` + +# Prepare processors + +# Prepare test workspace +wget https://qurator-data.de/examples/actevedef_718448162.first-page.zip +unzip actevedef_718448162.first-page.zip +cd actevedef_718448162.first-page + +# Run tests +ocrd-anybaseocr-binarize -I OCR-D-IMG -O OCR-D-BIN -P operation_level page -P threshold 0.3 +ocrd-anybaseocr-deskew -I OCR-D-BIN -O OCR-D-DESKEW -P maxskew 5.0 -P skewsteps 20 -P operation_level page diff --git a/wrapper/qurator/ocrd_galley/sub_images.py b/wrapper/qurator/ocrd_galley/sub_images.py index f2f4ae1..e532e82 100644 --- a/wrapper/qurator/ocrd_galley/sub_images.py +++ b/wrapper/qurator/ocrd_galley/sub_images.py @@ -32,6 +32,7 @@ sub_images = { "ocrd-skimage-binarize": "ocrd_wrap", "ocrd-skimage-denoise": "ocrd_wrap", "ocrd-eynollah-segment": "eynollah", + "ocrd-anybaseocr-binarize": "ocrd_anybaseocr", "ocrd-anybaseocr-crop": "ocrd_anybaseocr", "ocrd-anybaseocr-deskew": "ocrd_anybaseocr", "ocrd-trocr-recognize": "ocrd_trocr", From 097a7b7fc08afc3cc5bce360488481903d8b8f92 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 25 Apr 2024 17:24:34 +0200 
Subject: [PATCH 03/13] =?UTF-8?q?=F0=9F=9A=A7=20WIP:=20Migrate=20to=20usin?= =?UTF-8?q?g=20ocrd:all=20image=20-=20sbb=5Fbinarization?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile-sbb_binarization | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 Dockerfile-sbb_binarization diff --git a/Dockerfile-sbb_binarization b/Dockerfile-sbb_binarization deleted file mode 100644 index a80aea0..0000000 --- a/Dockerfile-sbb_binarization +++ /dev/null @@ -1,19 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG SBB_BINARIZATION_VERSION="0.1.0" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ -# Now the real stuff: - "sbb_binarization == $SBB_BINARIZATION_VERSION" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-sbb-binarize"] From 70c0da0cec755b17499529d2711d8f59956e8744 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 25 Apr 2024 20:27:58 +0200 Subject: [PATCH 04/13] =?UTF-8?q?=F0=9F=9A=A7=20WIP:=20Migrate=20to=20usin?= =?UTF-8?q?g=20ocrd:all=20image=20-=20Remove=20qurator=5Fdata=5Flib.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator_data_lib.sh | 156 -------------------------------------------- 1 file changed, 156 deletions(-) delete mode 100644 qurator_data_lib.sh diff --git a/qurator_data_lib.sh b/qurator_data_lib.sh deleted file mode 100644 index 704d54d..0000000 --- a/qurator_data_lib.sh +++ /dev/null @@ -1,156 +0,0 @@ -# ______________________________________ -#/ always copy the file from \ -#| mono-repo/qurator_data_lib.sh, never | -#\ edit the copy in the project / -# -------------------------------------- -# \ ^__^ -# \ (oo)\_______ -# (__)\ )\/\ -# ||----w | -# || || - -if [ -z "$BASH" ]; then - echo "qurator_data_lib.sh uses bash features, please make sure to run $0 in bash" - exit 1 -fi - 
-check_data_subdir() { - result=0 - - if git submodule status $DATA_SUBDIR | grep -q '^-'; then - echo "$DATA_SUBDIR/ is not an initialized submodule"; result=1 - fi - if ! [ -e $DATA_SUBDIR/.git/annex ]; then - echo "$DATA_SUBDIR/ is not a git annex repository"; result=1 - fi - if ! (cd $DATA_SUBDIR && git annex version | egrep -q 'local repository version: (7|8)'); then - echo "$DATA_SUBDIR/ is not a git annex repository version 7 or 8"; result=1 - fi - if ! (cd $DATA_SUBDIR && git remote | grep -q '^nfs$'); then - echo "$DATA_SUBDIR/ has no git remote 'nfs'"; result=1 - fi - - return $result -} - -annex_get() { - if [[ "$1" = '--allow_symlinks' ]]; then - allow_symlinks=1 - shift - else - allow_symlinks=0 - fi - file_pattern="$1" - - ( - cd $DATA_SUBDIR - git annex get $file_pattern - - # fsck seems to be necessary to fix the files if we are in a submodule - git annex fsck $file_pattern - - # Check that there are no symlinks = only unlocked files. This is needed for - # Docker builds, as we cannot dereference symlinks in a Dockerfile COPY. - if [[ $allow_symlinks = 0 ]]; then - git ls-files $file_pattern | while read f; do - if ! [[ -f "$f" ]]; then - echo "$DATA_SUBDIR/$f is not a regular file – Is an unlock needed?" - exit - fi - done - fi - ) -} - -# Options: -# --no-unpack Do NOT unpack the file -# --strip-components NUMBER (as tar's option) -download_to() { - unpack=1 - tar_options="" - - _options=$(getopt --long no-unpack,strip-components: -- "" "$@") - if [[ $? 
!= 0 ]]; then - echo "Bad parameters for download_to" >&2 - exit 1 - fi - eval set -- "$_options" - while true; do - case "$1" in - --no-unpack) - unpack=0 - ;; - --strip-components) - shift - components=$1 - tar_options="$tar_options --strip-components $components" - ;; - --) - shift - break - ;; - esac - shift - done - - download_source="$1" - dest="$2" - - ( - cd $DATA_SUBDIR - tmpf=`mktemp 'tmp.XXXXXX'` - curl -sSL -o $tmpf "$download_source" - if [[ $unpack = 1 ]]; then - mkdir -p "$dest" - # Unpacking relies on tar -a unpacking any tar compression - tar -C "$dest" $tar_options -af $tmpf -xv - rm -f $tmpf - else - dest_dir=`dirname "$dest"` - mkdir -p "$dest_dir" - mv $tmpf "$dest" - fi - ) -} - -suggest_commands() { - echo "Suggested commands:" - echo - echo "git submodule update --init" - echo "(cd $DATA_SUBDIR && git annex init --version=7)" - echo "(cd $DATA_SUBDIR && git remote add nfs annex@b-lx0053.sbb.spk-berlin.de:/var/lib/annex/qurator-data.git)" -} - -handle_data() { - if [[ "$1" = '--no-download' ]]; then - no_download=1 - shift - else - no_download=0 - fi - - if [ -n "$FORCE_DOWNLOAD" ]; then - get_from_web - elif ! 
check_data_subdir; then - if [[ $no_download = 1 ]]; then - select choice in "Abort to manually fix $DATA_SUBDIR submodule"; do - if [ $REPLY = 1 ]; then - suggest_commands - exit - fi - done - else - select choice in "Abort to manually fix $DATA_SUBDIR submodule" "Download data files from the web"; do - if [ $REPLY = 1 ]; then - suggest_commands - exit - else - get_from_web - break - fi - done - fi - else - get_from_annex - fi -} From 5dffd843aaa16ab4348740c89e845a43fd622224 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 25 Apr 2024 20:29:29 +0200 Subject: [PATCH 05/13] =?UTF-8?q?=F0=9F=9A=A7=20WIP:=20Migrate=20to=20usin?= =?UTF-8?q?g=20ocrd:all=20image=20-=20Move=20extra=20script=20to=20their?= =?UTF-8?q?=20own=20sub-directory?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- my_ocrd_workflow => extra/my_ocrd_workflow | 0 my_ocrd_workflow-sbb => extra/my_ocrd_workflow-sbb | 0 ocrd-workspace-from-images => extra/ocrd-workspace-from-images | 0 ppn2ocr => extra/ppn2ocr | 0 zdb2ocr => extra/zdb2ocr | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename my_ocrd_workflow => extra/my_ocrd_workflow (100%) rename my_ocrd_workflow-sbb => extra/my_ocrd_workflow-sbb (100%) rename ocrd-workspace-from-images => extra/ocrd-workspace-from-images (100%) rename ppn2ocr => extra/ppn2ocr (100%) rename zdb2ocr => extra/zdb2ocr (100%) diff --git a/my_ocrd_workflow b/extra/my_ocrd_workflow similarity index 100% rename from my_ocrd_workflow rename to extra/my_ocrd_workflow diff --git a/my_ocrd_workflow-sbb b/extra/my_ocrd_workflow-sbb similarity index 100% rename from my_ocrd_workflow-sbb rename to extra/my_ocrd_workflow-sbb diff --git a/ocrd-workspace-from-images b/extra/ocrd-workspace-from-images similarity index 100% rename from ocrd-workspace-from-images rename to extra/ocrd-workspace-from-images diff --git a/ppn2ocr b/extra/ppn2ocr similarity index 100% rename from ppn2ocr rename to extra/ppn2ocr diff --git a/zdb2ocr 
b/extra/zdb2ocr similarity index 100% rename from zdb2ocr rename to extra/zdb2ocr From 32372522720faafdf523e26c0464adb4a371e5fc Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 25 Apr 2024 20:31:07 +0200 Subject: [PATCH 06/13] =?UTF-8?q?=F0=9F=9A=A7=20WIP:=20Migrate=20to=20usin?= =?UTF-8?q?g=20ocrd:all=20image=20-=20Move=20extra=20script=20to=20their?= =?UTF-8?q?=20own=20sub-directory?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements-ppn2ocr.txt => extra/requirements-ppn2ocr.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename requirements-ppn2ocr.txt => extra/requirements-ppn2ocr.txt (100%) diff --git a/requirements-ppn2ocr.txt b/extra/requirements-ppn2ocr.txt similarity index 100% rename from requirements-ppn2ocr.txt rename to extra/requirements-ppn2ocr.txt From b8510409d72ce5a4625ee555ff54426a1add70c1 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 25 Apr 2024 21:39:08 +0200 Subject: [PATCH 07/13] =?UTF-8?q?=F0=9F=9A=A7=20WIP:=20Migrate=20to=20usin?= =?UTF-8?q?g=20ocrd:all=20image=20-=20ocrd=5Fcalamari?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile-ocrd_calamari | 20 -------------------- Dockerfile-ocrd_calamari03 | 22 ---------------------- test-ocrd_calamari.sh | 16 ++++++++++++++++ 3 files changed, 16 insertions(+), 42 deletions(-) delete mode 100644 Dockerfile-ocrd_calamari delete mode 100644 Dockerfile-ocrd_calamari03 create mode 100755 test-ocrd_calamari.sh diff --git a/Dockerfile-ocrd_calamari b/Dockerfile-ocrd_calamari deleted file mode 100644 index 3b9d9cc..0000000 --- a/Dockerfile-ocrd_calamari +++ /dev/null @@ -1,20 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - - -# XXX https://github.com/OCR-D/core/issues/642 -#ARG PIP_INSTALL="pip install --no-cache-dir" -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_CALAMARI_VERSION="1.0.5" - -# Build pip installable 
stuff -RUN ${PIP_INSTALL} \ - "ocrd_calamari == $OCRD_CALAMARI_VERSION" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-calamari-recognize"] diff --git a/Dockerfile-ocrd_calamari03 b/Dockerfile-ocrd_calamari03 deleted file mode 100644 index 5a8be3d..0000000 --- a/Dockerfile-ocrd_calamari03 +++ /dev/null @@ -1,22 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ -# Resolve conflicts early: - 'tensorflow-gpu == 1.15.*' \ - 'calamari-ocr == 0.3.5' \ -# Now the real stuff: - 'ocrd_calamari == 0.0.7' - - -# Check pip dependencies -RUN pip check - - -# Default command -RUN ln -s ocrd-calamari-recognize /usr/local/bin/ocrd-calamari-recognize03 -CMD ["ocrd-calamari-recognize"] diff --git a/test-ocrd_calamari.sh b/test-ocrd_calamari.sh new file mode 100755 index 0000000..1726e26 --- /dev/null +++ b/test-ocrd_calamari.sh @@ -0,0 +1,16 @@ +#!/bin/sh +set -ex + +test_id=`basename $0` +cd `mktemp -d /tmp/$test_id-XXXXX` + +# Prepare processors +ocrd resmgr download ocrd-calamari-recognize qurator-gt4histocr-1.0 + +# Prepare test workspace +wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip +unzip actevedef_718448162.first-page+binarization+segmentation.zip +cd actevedef_718448162.first-page+binarization+segmentation + +# Run tests +ocrd-calamari-recognize -I OCR-D-SEG-LINE-SBB -O OCR-D-OCR-CALA -P checkpoint_dir qurator-gt4histocr-1.0 From 068fff829dde121cbe0603ec822cf0b598b51481 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 6 May 2024 21:59:55 +0200 Subject: [PATCH 08/13] =?UTF-8?q?=F0=9F=A7=B9=20Remove=20CUDA=20test=20for?= =?UTF-8?q?=20now?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test-core-cuda12.1.sh | 1 - 1 file changed, 1 deletion(-) delete mode 120000 test-core-cuda12.1.sh diff --git 
a/test-core-cuda12.1.sh b/test-core-cuda12.1.sh deleted file mode 120000 index 2ee3591..0000000 --- a/test-core-cuda12.1.sh +++ /dev/null @@ -1 +0,0 @@ -test-core.sh \ No newline at end of file From 0a711c19dc2def50f2baebce260c7b45a64817ce Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 6 May 2024 22:05:27 +0200 Subject: [PATCH 09/13] =?UTF-8?q?=F0=9F=A7=B9=20Remove=20CUDA=20test=20for?= =?UTF-8?q?=20now?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile-core-cuda12.1 | 70 ---------------------------------------- 1 file changed, 70 deletions(-) delete mode 100644 Dockerfile-core-cuda12.1 diff --git a/Dockerfile-core-cuda12.1 b/Dockerfile-core-cuda12.1 deleted file mode 100644 index c494a2c..0000000 --- a/Dockerfile-core-cuda12.1 +++ /dev/null @@ -1,70 +0,0 @@ -FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04 - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_VERSION_MINIMUM="2.47.0" -ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 -ENV PIP_DEFAULT_TIMEOUT=120 - - -RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ - apt-get update && \ - apt-get install -y \ - build-essential \ - curl \ - git \ - xz-utils \ - pkg-config \ -# For add-apt-repository: - software-properties-common \ -# XML utils - libxml2-utils \ - xmlstarlet \ -# OCR-D uses ImageMagick for pixel density estimation - imagemagick \ -# pyenv builds -# TODO: builder container? - libz-dev \ - libssl-dev \ - libbz2-dev \ - liblzma-dev \ - libncurses-dev \ - libffi-dev \ - libreadline-dev \ - libsqlite3-dev \ - libmagic-dev \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - - -# Set up OCR-D logging -RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py - - -# Install pyenv -# TODO: do not run as root -# TODO: does just saying "3.7" work as intended? 
-ENV HOME=/root -ENV PYENV_ROOT=/usr/local/share/pyenv -ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH -RUN \ - git clone --depth=1 https://github.com/yyuu/pyenv.git $PYENV_ROOT && \ - pyenv install 3.7 && \ - pyenv global 3.7 && \ - pyenv rehash && \ - pip install -U pip wheel && \ - pip install setuptools - -# Install pip installable-stuff -RUN ${PIP_INSTALL} \ - "ocrd >= ${OCRD_VERSION_MINIMUM}" - - -# Check pip dependencies -RUN pip check - - -WORKDIR /data - -# Default command -CMD ['ocrd'] From e702939a44df206132a29c65f9b587906e5bd767 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 6 May 2024 22:26:07 +0200 Subject: [PATCH 10/13] =?UTF-8?q?=E2=9C=A8=20Add=20wrapper=20commands=20di?= =?UTF-8?q?nglehopper=20+=20dinglehopper-extract?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- wrapper/qurator/ocrd_galley/sub_images.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/wrapper/qurator/ocrd_galley/sub_images.py b/wrapper/qurator/ocrd_galley/sub_images.py index e532e82..560fe0a 100644 --- a/wrapper/qurator/ocrd_galley/sub_images.py +++ b/wrapper/qurator/ocrd_galley/sub_images.py @@ -39,4 +39,6 @@ sub_images = { # non OCR-D CLI "ocr-transform": "ocrd_fileformat", + "dinglehopper": "XXX now ocrd_all", + "dinglehopper-extract": "XXX now ocrd_all", } From 956de7492f9061e78d473260ef3f12ef50b8a577 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 6 May 2024 23:40:55 +0200 Subject: [PATCH 11/13] =?UTF-8?q?=F0=9F=90=9B=20Fix/complete=20ocrd=5Fcis?= =?UTF-8?q?=20support?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile-ocrd_cis | 19 ------------------- test-ocrd_cis.sh | 26 +++++++++++++++++++++----- 2 files changed, 21 insertions(+), 24 deletions(-) delete mode 100644 Dockerfile-ocrd_cis diff --git a/Dockerfile-ocrd_cis b/Dockerfile-ocrd_cis deleted file mode 100644 index e967893..0000000 --- a/Dockerfile-ocrd_cis +++ /dev/null @@ -1,19 +0,0 @@ 
-ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_CIS_VERSION="0.1.5" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ -# Now the real stuff: - "https://github.com/cisocrgroup/ocrd_cis/archive/v${OCRD_CIS_VERSION}.tar.gz" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-cis-ocropy-segment"] diff --git a/test-ocrd_cis.sh b/test-ocrd_cis.sh index 39afd91..f998df2 100755 --- a/test-ocrd_cis.sh +++ b/test-ocrd_cis.sh @@ -4,11 +4,27 @@ set -ex test_id=`basename $0` cd `mktemp -d /tmp/$test_id-XXXXX` +OCRD_CIS_OCROPY_MODEL=fraktur.pyrnn.gz + +# Prepare processors +ocrd resmgr download ocrd-cis-ocropy-recognize $OCRD_CIS_OCROPY_MODEL + # Prepare test workspace -wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip -unzip actevedef_718448162.first-page+binarization+segmentation.zip -cd actevedef_718448162.first-page+binarization+segmentation +wget https://qurator-data.de/examples/actevedef_718448162.first-page.zip +unzip actevedef_718448162.first-page.zip +cd actevedef_718448162.first-page + +# XXX ocrd-cis-ocropy-segment wasn't happy with the binarized input (no +# "binarized" AlternativeImage?!), so we do it here again +ocrd-skimage-binarize -I OCR-D-IMG -O OCR-D-IMG-BIN # Run tests -ocrd-cis-ocropy-segment -I OCR-D-IMG-BIN -O TEST-CIS-OCRPY-SEGMENT -# TODO -recognize +ocrd-cis-ocropy-segment \ + -I OCR-D-IMG-BIN -O TEST-CIS-OCROPY-SEG-LINE \ + -P level-of-operation page +test "$(grep TextLine TEST-CIS-OCROPY-SEG-LINE/*.xml | wc -l)" -gt 50 + +ocrd-cis-ocropy-recognize \ + -I TEST-CIS-OCROPY-SEG-LINE -O TEST-CIS-OCROPY-OCR \ + -P model $OCRD_CIS_OCROPY_MODEL +test "$(grep Unicode TEST-CIS-OCROPY-OCR/*.xml | wc -l)" -gt 50 From b01d2ca6a183e0794181958ff31af7ee2d0a4821 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 7 May 2024 18:11:56 +0200 Subject: [PATCH 12/13] 
=?UTF-8?q?=F0=9F=90=9B=20Fix=20ocrd=5Ftrocr=20(by?= =?UTF-8?q?=20adding=20custom=20image)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .dockerignore | 1 + Dockerfile-ocrd_trocr | 7 ++- build.sh | 4 ++ test-ocrd_trocr.sh | 4 +- wrapper/qurator/ocrd_galley/cli.py | 21 +++++++-- .../qurator/ocrd_galley/processor_images.py | 45 +++++++++++++++++++ wrapper/qurator/ocrd_galley/sub_images.py | 44 ------------------ 7 files changed, 74 insertions(+), 52 deletions(-) create mode 100644 .dockerignore create mode 100755 build.sh create mode 100644 wrapper/qurator/ocrd_galley/processor_images.py delete mode 100644 wrapper/qurator/ocrd_galley/sub_images.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..6b8710a --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +.git diff --git a/Dockerfile-ocrd_trocr b/Dockerfile-ocrd_trocr index fc05759..ef4fe15 100644 --- a/Dockerfile-ocrd_trocr +++ b/Dockerfile-ocrd_trocr @@ -1,8 +1,7 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT +FROM ocrd/all:maximum -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_TROCR_COMMIT="250ff1c" +ARG PIP_INSTALL="pip3 install --no-cache-dir" +ARG OCRD_TROCR_COMMIT="30696cb" # Build pip installable stuff diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..977e460 --- /dev/null +++ b/build.sh @@ -0,0 +1,4 @@ +#!/bin/sh +set -ex + +docker build . 
-t ocrd_trocr:latest -f Dockerfile-ocrd_trocr diff --git a/test-ocrd_trocr.sh b/test-ocrd_trocr.sh index d04be15..68295a9 100755 --- a/test-ocrd_trocr.sh +++ b/test-ocrd_trocr.sh @@ -12,4 +12,6 @@ unzip actevedef_718448162.first-page+binarization+segmentation.zip cd actevedef_718448162.first-page+binarization+segmentation # Run tests -ocrd-trocr-recognize -I OCR-D-SEG-LINE-SBB -O TEST-TROCR +ocrd-trocr-recognize -I OCR-D-SEG-LINE-SBB -O TEST-TROCR + +# TODO Does not use a useful model, does not check that text was recognized diff --git a/wrapper/qurator/ocrd_galley/cli.py b/wrapper/qurator/ocrd_galley/cli.py index 0263f42..a024a60 100644 --- a/wrapper/qurator/ocrd_galley/cli.py +++ b/wrapper/qurator/ocrd_galley/cli.py @@ -5,8 +5,9 @@ import colorama from pathlib import Path from termcolor import colored +from .processor_images import processor_images + -DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "maximum") # TODO rename LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") # xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler @@ -24,9 +25,23 @@ def main(): argv = sys.argv.copy() argv[0] = os.path.basename(argv[0]) - docker_image = "ocrd/all:%s" % (DOCKER_IMAGE_TAG, ) - if DOCKER_IMAGE_TAG != "maximum": + # If we're running ocrd resmgr download we need to run the correct subimage.
+ if argv[:3] == ["ocrd", "resmgr", "download"] or \ + argv[:3] == ["ocrd", "resmgr", "list-available"]: + # Default to the base image + processor_image = processor_images[argv[0]] + # But look for a match of the executable + for x in argv[3:]: + if x in processor_images: + processor_image = processor_images[x] + break + else: + processor_image = processor_images[argv[0]] + + docker_image = processor_image + + if docker_image != "ocrd/all:maximum": print(colored(f"Using {docker_image}", 'red')) docker_run(argv, docker_image) diff --git a/wrapper/qurator/ocrd_galley/processor_images.py b/wrapper/qurator/ocrd_galley/processor_images.py new file mode 100644 index 0000000..da11bd0 --- /dev/null +++ b/wrapper/qurator/ocrd_galley/processor_images.py @@ -0,0 +1,45 @@ +processor_images = { + "ocrd": "ocrd/all:maximum", + "ocrd-olena-binarize": "ocrd/all:maximum", + "ocrd-sbb-binarize": "ocrd/all:maximum", + "ocrd-sbb-textline-detector": "ocrd/all:maximum", + "ocrd-calamari-recognize": "ocrd/all:maximum", + "ocrd-calamari-recognize03": "ocrd/all:maximum", + "ocrd-tesserocr-segment-region": "ocrd/all:maximum", + "ocrd-tesserocr-segment-line": "ocrd/all:maximum", + "ocrd-tesserocr-recognize": "ocrd/all:maximum", + "ocrd-dinglehopper": "ocrd/all:maximum", + "ocrd-cis-ocropy-clip": "ocrd/all:maximum", + "ocrd-cis-ocropy-resegment": "ocrd/all:maximum", + "ocrd-cis-ocropy-segment": "ocrd/all:maximum", + "ocrd-cis-ocropy-deskew": "ocrd/all:maximum", + "ocrd-cis-ocropy-denoise": "ocrd/all:maximum", + "ocrd-cis-ocropy-binarize": "ocrd/all:maximum", + "ocrd-cis-ocropy-dewarp": "ocrd/all:maximum", + "ocrd-cis-ocropy-recognize": "ocrd/all:maximum", + "ocrd-fileformat-transform": "ocrd/all:maximum", + "ocrd-segment-extract-pages": "ocrd/all:maximum", + "ocrd-segment-extract-regions": "ocrd/all:maximum", + "ocrd-segment-extract-lines": "ocrd/all:maximum", + "ocrd-segment-from-masks": "ocrd/all:maximum", + "ocrd-segment-from-coco": "ocrd/all:maximum", + "ocrd-segment-repair": 
"ocrd/all:maximum", + "ocrd-segment-evaluate": "ocrd/all:maximum", + "ocrd-preprocess-image": "ocrd/all:maximum", + "ocrd-skimage-normalize": "ocrd/all:maximum", + "ocrd-skimage-denoise-raw": "ocrd/all:maximum", + "ocrd-skimage-binarize": "ocrd/all:maximum", + "ocrd-skimage-denoise": "ocrd/all:maximum", + "ocrd-eynollah-segment": "ocrd/all:maximum", + "ocrd-anybaseocr-binarize": "ocrd/all:maximum", + "ocrd-anybaseocr-crop": "ocrd/all:maximum", + "ocrd-anybaseocr-deskew": "ocrd/all:maximum", + + # non OCR-D CLI + "ocr-transform": "ocrd/all:maximum", + "dinglehopper": "ocrd/all:maximum", + "dinglehopper-extract": "ocrd/all:maximum", + + # specialized images + "ocrd-trocr-recognize": "ocrd_trocr", +} diff --git a/wrapper/qurator/ocrd_galley/sub_images.py b/wrapper/qurator/ocrd_galley/sub_images.py deleted file mode 100644 index 560fe0a..0000000 --- a/wrapper/qurator/ocrd_galley/sub_images.py +++ /dev/null @@ -1,44 +0,0 @@ -# TODO is a list now, basically (no more sub images) -sub_images = { - "ocrd": "core", - "ocrd-olena-binarize": "ocrd_olena", - "ocrd-sbb-binarize": "sbb_binarization", - "ocrd-sbb-textline-detector": "sbb_textline_detector", - "ocrd-calamari-recognize": "ocrd_calamari", - "ocrd-calamari-recognize03": "ocrd_calamari03", - "ocrd-tesserocr-segment-region": "ocrd_tesserocr", - "ocrd-tesserocr-segment-line": "ocrd_tesserocr", - "ocrd-tesserocr-recognize": "ocrd_tesserocr", - "ocrd-dinglehopper": "dinglehopper", - "ocrd-cis-ocropy-clip": "ocrd_cis", - "ocrd-cis-ocropy-resegment": "ocrd_cis", - "ocrd-cis-ocropy-segment": "ocrd_cis", - "ocrd-cis-ocropy-deskew": "ocrd_cis", - "ocrd-cis-ocropy-denoise": "ocrd_cis", - "ocrd-cis-ocropy-binarize": "ocrd_cis", - "ocrd-cis-ocropy-dewarp": "ocrd_cis", - "ocrd-cis-ocropy-recognize": "ocrd_cis", - "ocrd-fileformat-transform": "ocrd_fileformat", - "ocrd-segment-extract-pages": "ocrd_segment", - "ocrd-segment-extract-regions": "ocrd_segment", - "ocrd-segment-extract-lines": "ocrd_segment", - "ocrd-segment-from-masks": 
"ocrd_segment", - "ocrd-segment-from-coco": "ocrd_segment", - "ocrd-segment-repair": "ocrd_segment", - "ocrd-segment-evaluate": "ocrd_segment", - "ocrd-preprocess-image": "ocrd_wrap", - "ocrd-skimage-normalize": "ocrd_wrap", - "ocrd-skimage-denoise-raw": "ocrd_wrap", - "ocrd-skimage-binarize": "ocrd_wrap", - "ocrd-skimage-denoise": "ocrd_wrap", - "ocrd-eynollah-segment": "eynollah", - "ocrd-anybaseocr-binarize": "ocrd_anybaseocr", - "ocrd-anybaseocr-crop": "ocrd_anybaseocr", - "ocrd-anybaseocr-deskew": "ocrd_anybaseocr", - "ocrd-trocr-recognize": "ocrd_trocr", - - # non OCR-D CLI - "ocr-transform": "ocrd_fileformat", - "dinglehopper": "XXX now ocrd_all", - "dinglehopper-extract": "XXX now ocrd_all", -} From 7b5f593709250fe10c21902518520ea74e7b3878 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 7 May 2024 18:33:36 +0200 Subject: [PATCH 13/13] =?UTF-8?q?=E2=9C=A8=20Add=20a=20simple=20test=20run?= =?UTF-8?q?ner=20and=20move=20tests=20to=20subdirectory?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test.sh | 24 +++++++++++++++++++ test-core.sh => tests/test-core.sh | 0 .../test-dinglehopper.sh | 0 test-eynollah.sh => tests/test-eynollah.sh | 0 .../test-ocrd_anybaseocr.sh | 0 .../test-ocrd_calamari.sh | 0 test-ocrd_cis.sh => tests/test-ocrd_cis.sh | 0 .../test-ocrd_fileformat.sh | 0 .../test-ocrd_olena.sh | 0 .../test-ocrd_segment.sh | 0 .../test-ocrd_tesserocr.sh | 0 .../test-ocrd_trocr.sh | 0 test-ocrd_wrap.sh => tests/test-ocrd_wrap.sh | 0 .../test-sbb_binarization.sh | 0 14 files changed, 24 insertions(+) create mode 100755 test.sh rename test-core.sh => tests/test-core.sh (100%) rename test-dinglehopper.sh => tests/test-dinglehopper.sh (100%) rename test-eynollah.sh => tests/test-eynollah.sh (100%) rename test-ocrd_anybaseocr.sh => tests/test-ocrd_anybaseocr.sh (100%) rename test-ocrd_calamari.sh => tests/test-ocrd_calamari.sh (100%) rename test-ocrd_cis.sh => tests/test-ocrd_cis.sh (100%) rename 
test-ocrd_fileformat.sh => tests/test-ocrd_fileformat.sh (100%) rename test-ocrd_olena.sh => tests/test-ocrd_olena.sh (100%) rename test-ocrd_segment.sh => tests/test-ocrd_segment.sh (100%) rename test-ocrd_tesserocr.sh => tests/test-ocrd_tesserocr.sh (100%) rename test-ocrd_trocr.sh => tests/test-ocrd_trocr.sh (100%) rename test-ocrd_wrap.sh => tests/test-ocrd_wrap.sh (100%) rename test-sbb_binarization.sh => tests/test-sbb_binarization.sh (100%)
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000..f588bd6
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+set -e
+# Run every script in tests/, tally passes and failures, exit 1 if any failed.
+count_ok=0
+count_failed=0
+# POSIX test ([ ]) only: the shebang is /bin/sh, where [[ ]] may not exist.
+for test in tests/*.sh; do
+  echo "== $test"
+  "$test" && result=$? || result=$?
+  # The && / || list above keeps "set -e" from aborting on a failing test.
+  if [ "$result" -eq 0 ]; then
+    echo "✔"
+    count_ok=$((count_ok+1))
+  else
+    echo "❌"
+    count_failed=$((count_failed+1))
+  fi
+  echo
+done
+# Summarize and signal failure to the caller.
+echo "$count_ok ok, $count_failed failed"
+if [ "$count_failed" -gt 0 ]; then
+  exit 1
+fi
diff --git a/test-core.sh b/tests/test-core.sh similarity index 100% rename from test-core.sh rename to tests/test-core.sh diff --git a/test-dinglehopper.sh b/tests/test-dinglehopper.sh similarity index 100% rename from test-dinglehopper.sh rename to tests/test-dinglehopper.sh diff --git a/test-eynollah.sh b/tests/test-eynollah.sh similarity index 100% rename from test-eynollah.sh rename to tests/test-eynollah.sh diff --git a/test-ocrd_anybaseocr.sh b/tests/test-ocrd_anybaseocr.sh similarity index 100% rename from test-ocrd_anybaseocr.sh rename to tests/test-ocrd_anybaseocr.sh diff --git a/test-ocrd_calamari.sh b/tests/test-ocrd_calamari.sh similarity index 100% rename from test-ocrd_calamari.sh rename to tests/test-ocrd_calamari.sh diff --git a/test-ocrd_cis.sh b/tests/test-ocrd_cis.sh similarity index 100% rename from test-ocrd_cis.sh rename to tests/test-ocrd_cis.sh diff --git a/test-ocrd_fileformat.sh b/tests/test-ocrd_fileformat.sh similarity index 100% rename from test-ocrd_fileformat.sh rename to tests/test-ocrd_fileformat.sh diff --git
a/test-ocrd_olena.sh b/tests/test-ocrd_olena.sh similarity index 100% rename from test-ocrd_olena.sh rename to tests/test-ocrd_olena.sh diff --git a/test-ocrd_segment.sh b/tests/test-ocrd_segment.sh similarity index 100% rename from test-ocrd_segment.sh rename to tests/test-ocrd_segment.sh diff --git a/test-ocrd_tesserocr.sh b/tests/test-ocrd_tesserocr.sh similarity index 100% rename from test-ocrd_tesserocr.sh rename to tests/test-ocrd_tesserocr.sh diff --git a/test-ocrd_trocr.sh b/tests/test-ocrd_trocr.sh similarity index 100% rename from test-ocrd_trocr.sh rename to tests/test-ocrd_trocr.sh diff --git a/test-ocrd_wrap.sh b/tests/test-ocrd_wrap.sh similarity index 100% rename from test-ocrd_wrap.sh rename to tests/test-ocrd_wrap.sh diff --git a/test-sbb_binarization.sh b/tests/test-sbb_binarization.sh similarity index 100% rename from test-sbb_binarization.sh rename to tests/test-sbb_binarization.sh