From 699023c0843cd38dcc48f043c599242c594b3f28 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 25 Apr 2024 16:21:25 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20WIP:=20Migrate=20to=20using=20oc?= =?UTF-8?q?rd:all=20image=20+=20Update=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile-core | 70 ----------------------- Dockerfile-dinglehopper | 18 ------ Dockerfile-eynollah | 18 ------ Dockerfile-ocrd_fileformat | 24 -------- Dockerfile-ocrd_olena | 39 ------------- Dockerfile-ocrd_segment | 19 ------ Dockerfile-ocrd_tesserocr | 31 ---------- Dockerfile-ocrd_wrap | 18 ------ Dockerfile-sbb_textline_detector | 20 ------- build | 33 ----------- test-ocrd_tesserocr.sh | 2 + test-sbb_binarization.sh | 4 +- test-sbb_textline_detector.sh | 16 ------ wrapper/qurator/ocrd_galley/cli.py | 24 ++------ wrapper/qurator/ocrd_galley/sub_images.py | 1 + 15 files changed, 10 insertions(+), 327 deletions(-) delete mode 100644 Dockerfile-core delete mode 100644 Dockerfile-dinglehopper delete mode 100644 Dockerfile-eynollah delete mode 100644 Dockerfile-ocrd_fileformat delete mode 100644 Dockerfile-ocrd_olena delete mode 100644 Dockerfile-ocrd_segment delete mode 100644 Dockerfile-ocrd_tesserocr delete mode 100644 Dockerfile-ocrd_wrap delete mode 100644 Dockerfile-sbb_textline_detector delete mode 100755 build delete mode 100755 test-sbb_textline_detector.sh diff --git a/Dockerfile-core b/Dockerfile-core deleted file mode 100644 index c655d56..0000000 --- a/Dockerfile-core +++ /dev/null @@ -1,70 +0,0 @@ -FROM ubuntu:22.04 - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_VERSION_MINIMUM="2.47.0" -ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 -ENV PIP_DEFAULT_TIMEOUT=120 - - -RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ - apt-get update && \ - apt-get install -y \ - build-essential \ - curl \ - git \ - xz-utils \ - pkg-config \ -# For add-apt-repository: - software-properties-common \ -# XML utils - libxml2-utils \ - xmlstarlet \ -# OCR-D uses ImageMagick for pixel density estimation - imagemagick \ -# pyenv builds -# TODO: builder container? - libz-dev \ - libssl-dev \ - libbz2-dev \ - liblzma-dev \ - libncurses-dev \ - libffi-dev \ - libreadline-dev \ - libsqlite3-dev \ - libmagic-dev \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - - -# Set up OCR-D logging -RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py - - -# Install pyenv -# TODO: do not run as root -# TODO: does just saying "3.7" work as intended? -ENV HOME=/root -ENV PYENV_ROOT=/usr/local/share/pyenv -ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH -RUN \ - git clone --depth=1 https://github.com/yyuu/pyenv.git $PYENV_ROOT && \ - pyenv install 3.7 && \ - pyenv global 3.7 && \ - pyenv rehash && \ - pip install -U pip wheel && \ - pip install setuptools - -# Install pip installable-stuff -RUN ${PIP_INSTALL} \ - "ocrd >= ${OCRD_VERSION_MINIMUM}" - - -# Check pip dependencies -RUN pip check - - -WORKDIR /data - -# Default command -CMD ['ocrd'] diff --git a/Dockerfile-dinglehopper b/Dockerfile-dinglehopper deleted file mode 100644 index 765a1f2..0000000 --- a/Dockerfile-dinglehopper +++ /dev/null @@ -1,18 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG DINGLEHOPPER_VERSION="0.9.2" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "dinglehopper == $DINGLEHOPPER_VERSION" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-dinglehopper"] diff --git a/Dockerfile-eynollah b/Dockerfile-eynollah deleted file mode 100644 index 6505174..0000000 --- a/Dockerfile-eynollah +++ /dev/null @@ -1,18 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG EYNOLLAH_VERSION="0.3.0" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "eynollah == ${EYNOLLAH_VERSION}" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-eynollah-segment"] diff --git a/Dockerfile-ocrd_fileformat b/Dockerfile-ocrd_fileformat deleted file mode 100644 index 060f79c..0000000 --- a/Dockerfile-ocrd_fileformat +++ /dev/null @@ -1,24 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_FILEFORMAT_VERSION="0.5.0" - - -RUN apt-get update && \ - apt-get install -y \ - git \ - openjdk-11-jdk-headless \ - wget \ - unzip \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* -RUN git clone --depth 1 --branch v${OCRD_FILEFORMAT_VERSION} https://github.com/OCR-D/ocrd_fileformat.git && \ - cd ocrd_fileformat/ && \ - git submodule update --init && \ - make install-fileformat install PREFIX=/usr/local && \ - cd .. && rm -rf ocrd_fileformat/ - - -# Default command -CMD ['ocrd-fileformat-transform'] diff --git a/Dockerfile-ocrd_olena b/Dockerfile-ocrd_olena deleted file mode 100644 index 29be067..0000000 --- a/Dockerfile-ocrd_olena +++ /dev/null @@ -1,39 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_OLENA_VERSION="1.3.0" - - -# Build ocrd_olena -RUN apt-get update && \ - apt-get install -y \ - imagemagick \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - -# Install olena from .deb -RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1.0+ocrd-git+2-ubuntu22.04/olena-bin_2.1.0+ocrd-git+2_amd64.deb && \ - dpkg -i --force-depends olena-bin_2.1.0+ocrd-git+2_amd64.deb && \ - rm -f olena-bin_2.1.0+ocrd-git+2_amd64.deb && \ - apt-get update && \ - apt-get -f install -y && \ - apt-get clean && rm -rf /var/lib/apt/lists/* && \ - if ! scribo-cli sauvola --help >/dev/null 2>&1; then echo "Olena/scribo is not installed correctly" >&2; exit 1; fi -RUN curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ - mkdir ocrd_olena && \ - tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \ - cd ocrd_olena && \ - sed -i 's/^install: deps/install:/' Makefile && \ - ${PIP_INSTALL} ocrd && \ - apt install xmlstarlet && \ - make install PREFIX=/usr/local && \ - cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ['ocrd-olena-binarize'] diff --git a/Dockerfile-ocrd_segment b/Dockerfile-ocrd_segment deleted file mode 100644 index 284f45d..0000000 --- a/Dockerfile-ocrd_segment +++ /dev/null @@ -1,19 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_SEGMENT_VERSION="0.1.22" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ -# Now the real stuff: - "ocrd-segment == ${OCRD_SEGMENT_VERSION}" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-segment-extract-regions"] diff --git a/Dockerfile-ocrd_tesserocr b/Dockerfile-ocrd_tesserocr deleted file mode 100644 index c046cfc..0000000 --- a/Dockerfile-ocrd_tesserocr +++ /dev/null @@ -1,31 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG TESSDATA_BEST_VERSION="4.0.0" -ARG OCRD_TESSEROCR_VERSION="0.17.0" -ENV TESSDATA_PREFIX /usr/local/share/tessdata - - -# Install Leptonica and Tesseract. -# TODO: Review if alex-p's repo is still necessary on jammy (jammy has 4.1.1, -# alex-p has 4.1.3, but not for jammy.) -# RUN add-apt-repository ppa:alex-p/tesseract-ocr && \ -RUN apt-get update && \ - apt-get install -y \ - tesseract-ocr \ - libtesseract-dev \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "ocrd_tesserocr == ${OCRD_TESSEROCR_VERSION}" - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-tesserocr-recognize"] diff --git a/Dockerfile-ocrd_wrap b/Dockerfile-ocrd_wrap deleted file mode 100644 index 518d306..0000000 --- a/Dockerfile-ocrd_wrap +++ /dev/null @@ -1,18 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_WRAP_VERSION="0.1.7" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "ocrd_wrap == ${OCRD_WRAP_VERSION}" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-preprocess-image"] diff --git a/Dockerfile-sbb_textline_detector b/Dockerfile-sbb_textline_detector deleted file mode 100644 index 0569ab8..0000000 --- a/Dockerfile-sbb_textline_detector +++ /dev/null @@ -1,20 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG SBB_TEXTLINE_DETECTOR_COMMIT="c4df3d6" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - # https://github.com/qurator-spk/sbb_textline_detection/issues/50 - "h5py < 3" \ - https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-sbb-textline-detector"] diff --git a/build b/build deleted file mode 100755 index 96835cb..0000000 --- a/build +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -set -e - -self=`realpath $0` -self_dir=`dirname "$self"` - - - -if [ -n "$1" ]; then - sub_images="" - for arg in "$@"; do - arg_sub_image=`echo "$arg" | sed 's/Dockerfile-//'` - NL=$'\n' - sub_images+="$NL$arg_sub_image" - done -else - sub_images=`ls -1 Dockerfile-core* | sed 's/Dockerfile-//'` - sub_images="$sub_images `ls -1 Dockerfile-* | sed 's/Dockerfile-//'`" -fi -echo "Building:" -echo "$sub_images" -echo - - -# Update base images if we build a core image -if echo "$sub_images" | grep -q core; then - docker pull ubuntu:22.04 - docker pull nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04 -fi - -for sub_image in $sub_images; do - docker build --cache-from=quratorspk/ocrd-galley-$sub_image -t quratorspk/ocrd-galley-$sub_image -f Dockerfile-$sub_image . -done diff --git a/test-ocrd_tesserocr.sh b/test-ocrd_tesserocr.sh index 3cd403c..007698d 100755 --- a/test-ocrd_tesserocr.sh +++ b/test-ocrd_tesserocr.sh @@ -5,6 +5,8 @@ test_id=`basename $0` cd `mktemp -d /tmp/$test_id-XXXXX` # Prepare processors +ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata +ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata ocrd resmgr download ocrd-tesserocr-recognize Fraktur_GT4HistOCR.traineddata # Prepare test workspace diff --git a/test-sbb_binarization.sh b/test-sbb_binarization.sh index cc0afa9..4ecfbbf 100755 --- a/test-sbb_binarization.sh +++ b/test-sbb_binarization.sh @@ -5,7 +5,7 @@ test_id=`basename $0` cd `mktemp -d /tmp/$test_id-XXXXX` # Prepare processors -ocrd resmgr download ocrd-sbb-binarize default-2021-03-09 +ocrd resmgr download ocrd-sbb-binarize default # Prepare test workspace wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip @@ -13,4 +13,4 @@ unzip actevedef_718448162.first-page+binarization+segmentation.zip cd actevedef_718448162.first-page+binarization+segmentation # Run tests -ocrd-sbb-binarize -P model default-2021-03-09 -I OCR-D-IMG -O TEST-OCRD-SBB-BINARIZE +ocrd-sbb-binarize -P model default -I OCR-D-IMG -O TEST-OCRD-SBB-BINARIZE diff --git a/test-sbb_textline_detector.sh b/test-sbb_textline_detector.sh deleted file mode 100755 index 9f940f1..0000000 --- a/test-sbb_textline_detector.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/sh -set -ex - -test_id=`basename $0` -cd `mktemp -d /tmp/$test_id-XXXXX` - -# Prepare processors -ocrd resmgr download ocrd-sbb-textline-detector default - -# Prepare test workspace -wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip -unzip actevedef_718448162.first-page+binarization+segmentation.zip -cd actevedef_718448162.first-page+binarization+segmentation - -# Run tests -ocrd-sbb-textline-detector -P models default -I OCR-D-IMG-BIN -O TEST-EYNOLLAH-SEG diff --git a/wrapper/qurator/ocrd_galley/cli.py b/wrapper/qurator/ocrd_galley/cli.py index 9423c61..0263f42 100644 --- a/wrapper/qurator/ocrd_galley/cli.py +++ b/wrapper/qurator/ocrd_galley/cli.py @@ -5,10 +5,8 @@ import colorama from pathlib import Path from termcolor import colored -from .sub_images import sub_images -DOCKER_IMAGE_PREFIX = os.environ.get("DOCKER_IMAGE_PREFIX", "quratorspk/ocrd-galley") -DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "latest") +DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "maximum") # TODO rename LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") # xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler @@ -26,22 +24,9 @@ def main(): argv = sys.argv.copy() argv[0] = os.path.basename(argv[0]) - # If we're running ocrd resmgr download we need to run the correct subimage. - if argv[:3] == ["ocrd", "resmgr", "download"] or \ - argv[:3] == ["ocrd", "resmgr", "list-available"]: - # Default to the base image - sub_image = sub_images[argv[0]] - # But look for a match of the executable - for x in argv[3:]: - if x in sub_images: - sub_image = sub_images[x] - break - else: - sub_image = sub_images[argv[0]] - - docker_image = "%s-%s:%s" % (DOCKER_IMAGE_PREFIX, sub_image, DOCKER_IMAGE_TAG) - - if DOCKER_IMAGE_TAG != "latest": + docker_image = "ocrd/all:%s" % (DOCKER_IMAGE_TAG, ) + + if DOCKER_IMAGE_TAG != "maximum": print(colored(f"Using {docker_image}", 'red')) docker_run(argv, docker_image) @@ -50,6 +35,7 @@ def docker_run(argv, docker_image): docker_run_options = [] docker_run_options.extend(["--rm", "-t"]) docker_run_options.extend(["--mount", "type=bind,src=%s,target=/data" % os.getcwd()]) + docker_run_options.extend(["--mount", "type=tmpfs,target=/tmp"]) docker_run_options.extend(["--user", "%s:%s" % (os.getuid(), os.getgid())]) docker_run_options.extend(["-e", "LOG_LEVEL=%s" % LOG_LEVEL]) docker_run_options.extend(["-e", "_OCRD_COMPLETE"]) diff --git a/wrapper/qurator/ocrd_galley/sub_images.py b/wrapper/qurator/ocrd_galley/sub_images.py index aaea945..f2f4ae1 100644 --- a/wrapper/qurator/ocrd_galley/sub_images.py +++ b/wrapper/qurator/ocrd_galley/sub_images.py @@ -1,3 +1,4 @@ +# TODO is a list now, basically (no more sub images) sub_images = { "ocrd": "core", "ocrd-olena-binarize": "ocrd_olena",