🚧 WIP: Migrate to using the ocrd/all image + update tests

2025-07-27 21:59:52 +02:00 · 2024-04-25 16:21:25 +02:00 · 2024-04-25 16:21:25 +02:00 · 699023c084
commit 699023c084
parent fc911f3734
15 changed files with 9 additions and 326 deletions
--- a/70
+++ b/70
@ -1,70 +0,0 @@
-FROM ubuntu:22.04
-
-ARG PIP_INSTALL="pip install --no-cache-dir"
-ARG OCRD_VERSION_MINIMUM="2.47.0"
-ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
-ENV PIP_DEFAULT_TIMEOUT=120
-
-
-RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
-    apt-get update && \
-    apt-get install -y \
-      build-essential \
-      curl \
-      git \
-      xz-utils \
-      pkg-config \
-# For add-apt-repository:
-      software-properties-common \
-# XML utils
-      libxml2-utils \
-      xmlstarlet \
-# OCR-D uses ImageMagick for pixel density estimation
-      imagemagick \
-# pyenv builds
-# TODO: builder container?
-      libz-dev \
-      libssl-dev \
-      libbz2-dev \
-      liblzma-dev \
-      libncurses-dev \
-      libffi-dev \
-      libreadline-dev \
-      libsqlite3-dev \
-      libmagic-dev \
-    && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-
-# Set up OCR-D logging
-RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py
-
-
-# Install pyenv
-# TODO: do not run as root
-# TODO: does just saying "3.7" work as intended?
-ENV HOME=/root
-ENV PYENV_ROOT=/usr/local/share/pyenv
-ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
-RUN \
-    git clone --depth=1 https://github.com/yyuu/pyenv.git $PYENV_ROOT && \
-    pyenv install 3.7 && \
-    pyenv global 3.7 && \
-    pyenv rehash && \
-    pip install -U pip wheel && \
-    pip install setuptools
-
-# Install pip installable-stuff
-RUN ${PIP_INSTALL} \
-        "ocrd >= ${OCRD_VERSION_MINIMUM}"
-
-
-# Check pip dependencies
-RUN pip check
-
-
-WORKDIR /data
-
-# Default command
-CMD ['ocrd']
--- a/18
+++ b/18
@ -1,18 +0,0 @@
-ARG GIT_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
-
-ARG PIP_INSTALL="pip install --no-cache-dir"
-ARG DINGLEHOPPER_VERSION="0.9.2"
-
-
-# Build pip installable stuff
-RUN ${PIP_INSTALL} \
-        "dinglehopper == $DINGLEHOPPER_VERSION"
-
-
-# Check pip dependencies
-RUN pip check
-
-
-# Default command
-CMD ["ocrd-dinglehopper"]
--- a/18
+++ b/18
@ -1,18 +0,0 @@
-ARG GIT_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT
-
-ARG PIP_INSTALL="pip install --no-cache-dir"
-ARG EYNOLLAH_VERSION="0.3.0"
-
-
-# Build pip installable stuff
-RUN ${PIP_INSTALL} \
-    "eynollah == ${EYNOLLAH_VERSION}"
-
-
-# Check pip dependencies
-RUN pip check
-
-
-# Default command
-CMD ["ocrd-eynollah-segment"]
--- a/24
+++ b/24
@ -1,24 +0,0 @@
-ARG GIT_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
-
-ARG PIP_INSTALL="pip install --no-cache-dir"
-ARG OCRD_FILEFORMAT_VERSION="0.5.0"
-
-
-RUN apt-get update && \
-    apt-get install -y \
-      git \
-      openjdk-11-jdk-headless \
-      wget \
-      unzip \
-    && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-RUN git clone --depth 1 --branch v${OCRD_FILEFORMAT_VERSION} https://github.com/OCR-D/ocrd_fileformat.git && \
-    cd ocrd_fileformat/ && \
-    git submodule update --init && \
-    make install-fileformat install PREFIX=/usr/local && \
-    cd .. && rm -rf ocrd_fileformat/
-
-
-# Default command
-CMD ['ocrd-fileformat-transform']
--- a/39
+++ b/39
@ -1,39 +0,0 @@
-ARG GIT_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
-
-ARG PIP_INSTALL="pip install --no-cache-dir"
-ARG OCRD_OLENA_VERSION="1.3.0"
-
-
-# Build ocrd_olena
-RUN apt-get update && \
-    apt-get install -y \
-      imagemagick \
-    && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-
-# Install olena from .deb
-RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1.0+ocrd-git+2-ubuntu22.04/olena-bin_2.1.0+ocrd-git+2_amd64.deb && \
-    dpkg -i --force-depends olena-bin_2.1.0+ocrd-git+2_amd64.deb && \
-    rm -f olena-bin_2.1.0+ocrd-git+2_amd64.deb && \
-    apt-get update && \
-    apt-get -f install -y && \
-    apt-get clean && rm -rf /var/lib/apt/lists/* && \
-    if ! scribo-cli sauvola --help >/dev/null 2>&1; then echo "Olena/scribo is not installed correctly" >&2; exit 1; fi
-RUN curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \
-   mkdir ocrd_olena && \
-   tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \
-   cd ocrd_olena && \
-   sed -i 's/^install: deps/install:/' Makefile && \
-   ${PIP_INSTALL} ocrd && \
-   apt install xmlstarlet && \
-   make install PREFIX=/usr/local && \
-   cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz
-
-
-# Check pip dependencies
-RUN pip check
-
-
-# Default command
-CMD ['ocrd-olena-binarize']
--- a/19
+++ b/19
@ -1,19 +0,0 @@
-ARG GIT_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
-
-ARG PIP_INSTALL="pip install --no-cache-dir"
-ARG OCRD_SEGMENT_VERSION="0.1.22"
-
-
-# Build pip installable stuff
-RUN ${PIP_INSTALL} \
-# Now the real stuff:
-        "ocrd-segment == ${OCRD_SEGMENT_VERSION}"
-
-
-# Check pip dependencies
-RUN pip check
-
-
-# Default command
-CMD ["ocrd-segment-extract-regions"]
--- a/31
+++ b/31
@ -1,31 +0,0 @@
-ARG GIT_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
-
-ARG PIP_INSTALL="pip install --no-cache-dir"
-ARG TESSDATA_BEST_VERSION="4.0.0"
-ARG OCRD_TESSEROCR_VERSION="0.17.0"
-ENV TESSDATA_PREFIX /usr/local/share/tessdata
-
-
-# Install Leptonica and Tesseract.
-# TODO: Review if alex-p's repo is still necessary on jammy (jammy has 4.1.1,
-# alex-p has 4.1.3, but not for jammy.)
-# RUN add-apt-repository ppa:alex-p/tesseract-ocr && \
-RUN apt-get update && \
-    apt-get install -y \
-        tesseract-ocr \
-        libtesseract-dev \
-    && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-
-
-# Build pip installable stuff
-RUN ${PIP_INSTALL} \
-    "ocrd_tesserocr == ${OCRD_TESSEROCR_VERSION}"
-
-# Check pip dependencies
-RUN pip check
-
-
-# Default command
-CMD ["ocrd-tesserocr-recognize"]
--- a/18
+++ b/18
@ -1,18 +0,0 @@
-ARG GIT_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
-
-ARG PIP_INSTALL="pip install --no-cache-dir"
-ARG OCRD_WRAP_VERSION="0.1.7"
-
-
-# Build pip installable stuff
-RUN ${PIP_INSTALL} \
-    "ocrd_wrap == ${OCRD_WRAP_VERSION}"
-
-
-# Check pip dependencies
-RUN pip check
-
-
-# Default command
-CMD ["ocrd-preprocess-image"]
--- a/20
+++ b/20
@ -1,20 +0,0 @@
-ARG GIT_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT
-
-ARG PIP_INSTALL="pip install --no-cache-dir"
-ARG SBB_TEXTLINE_DETECTOR_COMMIT="c4df3d6"
-
-
-# Build pip installable stuff
-RUN ${PIP_INSTALL} \
-    # https://github.com/qurator-spk/sbb_textline_detection/issues/50
-    "h5py < 3" \
-    https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz
-
-
-# Check pip dependencies
-RUN pip check
-
-
-# Default command
-CMD ["ocrd-sbb-textline-detector"]
--- a/33
+++ b/33
@ -1,33 +0,0 @@
-#!/bin/bash
-set -e
-
-self=`realpath $0`
-self_dir=`dirname "$self"`
-
-
-
-if [ -n "$1" ]; then
-  sub_images=""
-  for arg in "$@"; do
-    arg_sub_image=`echo "$arg" | sed 's/Dockerfile-//'`
-    NL=$'\n'
-    sub_images+="$NL$arg_sub_image"
-  done
-else
-  sub_images=`ls -1 Dockerfile-core* | sed 's/Dockerfile-//'`
-  sub_images="$sub_images `ls -1 Dockerfile-* | sed 's/Dockerfile-//'`"
-fi
-echo "Building:"
-echo "$sub_images"
-echo
-
-
-# Update base images if we build a core image
-if echo "$sub_images" | grep -q core; then
-  docker pull ubuntu:22.04
-  docker pull nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
-fi
-
-for sub_image in $sub_images; do
-  docker build --cache-from=quratorspk/ocrd-galley-$sub_image -t quratorspk/ocrd-galley-$sub_image -f Dockerfile-$sub_image .
-done
--- a/test-ocrd_tesserocr.sh
+++ b/test-ocrd_tesserocr.sh
@ -5,6 +5,8 @@ test_id=`basename $0`
 cd `mktemp -d /tmp/$test_id-XXXXX`

 # Prepare processors
+ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
+ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
 ocrd resmgr download ocrd-tesserocr-recognize Fraktur_GT4HistOCR.traineddata

 # Prepare test workspace
--- a/test-sbb_binarization.sh
+++ b/test-sbb_binarization.sh
@ -5,7 +5,7 @@ test_id=`basename $0`
 cd `mktemp -d /tmp/$test_id-XXXXX`

 # Prepare processors
-ocrd resmgr download ocrd-sbb-binarize default-2021-03-09
+ocrd resmgr download ocrd-sbb-binarize default

 # Prepare test workspace
 wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip
@ -13,4 +13,4 @@ unzip actevedef_718448162.first-page+binarization+segmentation.zip
 cd actevedef_718448162.first-page+binarization+segmentation

 # Run tests
-ocrd-sbb-binarize -P model default-2021-03-09 -I OCR-D-IMG -O TEST-OCRD-SBB-BINARIZE
+ocrd-sbb-binarize -P model default -I OCR-D-IMG -O TEST-OCRD-SBB-BINARIZE
--- a/test-sbb_textline_detector.sh
+++ b/test-sbb_textline_detector.sh
@ -1,16 +0,0 @@
-#!/bin/sh
-set -ex
-
-test_id=`basename $0`
-cd `mktemp -d /tmp/$test_id-XXXXX`
-
-# Prepare processors
-ocrd resmgr download ocrd-sbb-textline-detector default
-
-# Prepare test workspace
-wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip
-unzip actevedef_718448162.first-page+binarization+segmentation.zip
-cd actevedef_718448162.first-page+binarization+segmentation
-
-# Run tests
-ocrd-sbb-textline-detector -P models default -I OCR-D-IMG-BIN -O TEST-EYNOLLAH-SEG
--- a/wrapper/qurator/ocrd_galley/cli.py
+++ b/wrapper/qurator/ocrd_galley/cli.py
@ -5,10 +5,8 @@ import colorama
 from pathlib import Path
 from termcolor import colored

-from .sub_images import sub_images

-DOCKER_IMAGE_PREFIX = os.environ.get("DOCKER_IMAGE_PREFIX", "quratorspk/ocrd-galley")
-DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "latest")
+DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "maximum")  # TODO rename
 LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")

 # xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler
@ -26,22 +24,9 @@ def main():
    argv = sys.argv.copy()
    argv[0] = os.path.basename(argv[0])

-    # If we're running ocrd resmgr download we need to run the correct subimage.
-    if argv[:3] == ["ocrd", "resmgr", "download"] or \
-       argv[:3] == ["ocrd", "resmgr", "list-available"]:
-        # Default to the base image
-        sub_image = sub_images[argv[0]]
-        # But look for a match of the executable
-        for x in argv[3:]:
-            if x in sub_images:
-                sub_image = sub_images[x]
-                break
-    else:
-        sub_image = sub_images[argv[0]]
+    docker_image = "ocrd/all:%s" % (DOCKER_IMAGE_TAG, )

-    docker_image = "%s-%s:%s" % (DOCKER_IMAGE_PREFIX, sub_image, DOCKER_IMAGE_TAG)
-
-    if DOCKER_IMAGE_TAG != "latest":
+    if DOCKER_IMAGE_TAG != "maximum":
        print(colored(f"Using {docker_image}", 'red'))
    docker_run(argv, docker_image)

@ -50,6 +35,7 @@ def docker_run(argv, docker_image):
    docker_run_options = []
    docker_run_options.extend(["--rm", "-t"])
    docker_run_options.extend(["--mount", "type=bind,src=%s,target=/data" % os.getcwd()])
+    docker_run_options.extend(["--mount", "type=tmpfs,target=/tmp"])
    docker_run_options.extend(["--user", "%s:%s" % (os.getuid(), os.getgid())])
    docker_run_options.extend(["-e", "LOG_LEVEL=%s" % LOG_LEVEL])
    docker_run_options.extend(["-e", "_OCRD_COMPLETE"])
--- a/wrapper/qurator/ocrd_galley/sub_images.py
+++ b/wrapper/qurator/ocrd_galley/sub_images.py
@ -1,3 +1,4 @@
+# TODO: This is basically a list now (there are no more sub images)
 sub_images = {
        "ocrd": "core",
        "ocrd-olena-binarize": "ocrd_olena",