diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..6b8710a --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +.git diff --git a/Dockerfile-core b/Dockerfile-core deleted file mode 100644 index c655d56..0000000 --- a/Dockerfile-core +++ /dev/null @@ -1,70 +0,0 @@ -FROM ubuntu:22.04 - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_VERSION_MINIMUM="2.47.0" -ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 -ENV PIP_DEFAULT_TIMEOUT=120 - - -RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ - apt-get update && \ - apt-get install -y \ - build-essential \ - curl \ - git \ - xz-utils \ - pkg-config \ -# For add-apt-repository: - software-properties-common \ -# XML utils - libxml2-utils \ - xmlstarlet \ -# OCR-D uses ImageMagick for pixel density estimation - imagemagick \ -# pyenv builds -# TODO: builder container? - libz-dev \ - libssl-dev \ - libbz2-dev \ - liblzma-dev \ - libncurses-dev \ - libffi-dev \ - libreadline-dev \ - libsqlite3-dev \ - libmagic-dev \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - - -# Set up OCR-D logging -RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py - - -# Install pyenv -# TODO: do not run as root -# TODO: does just saying "3.7" work as intended? 
-ENV HOME=/root -ENV PYENV_ROOT=/usr/local/share/pyenv -ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH -RUN \ - git clone --depth=1 https://github.com/yyuu/pyenv.git $PYENV_ROOT && \ - pyenv install 3.7 && \ - pyenv global 3.7 && \ - pyenv rehash && \ - pip install -U pip wheel && \ - pip install setuptools - -# Install pip installable-stuff -RUN ${PIP_INSTALL} \ - "ocrd >= ${OCRD_VERSION_MINIMUM}" - - -# Check pip dependencies -RUN pip check - - -WORKDIR /data - -# Default command -CMD ['ocrd'] diff --git a/Dockerfile-core-cuda12.1 b/Dockerfile-core-cuda12.1 deleted file mode 100644 index c494a2c..0000000 --- a/Dockerfile-core-cuda12.1 +++ /dev/null @@ -1,70 +0,0 @@ -FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04 - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_VERSION_MINIMUM="2.47.0" -ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 -ENV PIP_DEFAULT_TIMEOUT=120 - - -RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ - apt-get update && \ - apt-get install -y \ - build-essential \ - curl \ - git \ - xz-utils \ - pkg-config \ -# For add-apt-repository: - software-properties-common \ -# XML utils - libxml2-utils \ - xmlstarlet \ -# OCR-D uses ImageMagick for pixel density estimation - imagemagick \ -# pyenv builds -# TODO: builder container? - libz-dev \ - libssl-dev \ - libbz2-dev \ - liblzma-dev \ - libncurses-dev \ - libffi-dev \ - libreadline-dev \ - libsqlite3-dev \ - libmagic-dev \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - - -# Set up OCR-D logging -RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py - - -# Install pyenv -# TODO: do not run as root -# TODO: does just saying "3.7" work as intended? 
-ENV HOME=/root -ENV PYENV_ROOT=/usr/local/share/pyenv -ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH -RUN \ - git clone --depth=1 https://github.com/yyuu/pyenv.git $PYENV_ROOT && \ - pyenv install 3.7 && \ - pyenv global 3.7 && \ - pyenv rehash && \ - pip install -U pip wheel && \ - pip install setuptools - -# Install pip installable-stuff -RUN ${PIP_INSTALL} \ - "ocrd >= ${OCRD_VERSION_MINIMUM}" - - -# Check pip dependencies -RUN pip check - - -WORKDIR /data - -# Default command -CMD ['ocrd'] diff --git a/Dockerfile-dinglehopper b/Dockerfile-dinglehopper deleted file mode 100644 index 765a1f2..0000000 --- a/Dockerfile-dinglehopper +++ /dev/null @@ -1,18 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG DINGLEHOPPER_VERSION="0.9.2" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "dinglehopper == $DINGLEHOPPER_VERSION" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-dinglehopper"] diff --git a/Dockerfile-eynollah b/Dockerfile-eynollah deleted file mode 100644 index 6505174..0000000 --- a/Dockerfile-eynollah +++ /dev/null @@ -1,18 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG EYNOLLAH_VERSION="0.3.0" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "eynollah == ${EYNOLLAH_VERSION}" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-eynollah-segment"] diff --git a/Dockerfile-ocrd_anybaseocr b/Dockerfile-ocrd_anybaseocr deleted file mode 100644 index 6ce5d0e..0000000 --- a/Dockerfile-ocrd_anybaseocr +++ /dev/null @@ -1,19 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_ANYBASEOCR_VERSION="1.8.2" - - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "ocrd_anybaseocr == ${OCRD_ANYBASEOCR_VERSION}" - - -# Check 
pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-anybaseocr-crop"] diff --git a/Dockerfile-ocrd_calamari b/Dockerfile-ocrd_calamari deleted file mode 100644 index 3b9d9cc..0000000 --- a/Dockerfile-ocrd_calamari +++ /dev/null @@ -1,20 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - - -# XXX https://github.com/OCR-D/core/issues/642 -#ARG PIP_INSTALL="pip install --no-cache-dir" -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_CALAMARI_VERSION="1.0.5" - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "ocrd_calamari == $OCRD_CALAMARI_VERSION" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-calamari-recognize"] diff --git a/Dockerfile-ocrd_calamari03 b/Dockerfile-ocrd_calamari03 deleted file mode 100644 index 5a8be3d..0000000 --- a/Dockerfile-ocrd_calamari03 +++ /dev/null @@ -1,22 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ -# Resolve conflicts early: - 'tensorflow-gpu == 1.15.*' \ - 'calamari-ocr == 0.3.5' \ -# Now the real stuff: - 'ocrd_calamari == 0.0.7' - - -# Check pip dependencies -RUN pip check - - -# Default command -RUN ln -s ocrd-calamari-recognize /usr/local/bin/ocrd-calamari-recognize03 -CMD ["ocrd-calamari-recognize"] diff --git a/Dockerfile-ocrd_cis b/Dockerfile-ocrd_cis deleted file mode 100644 index e967893..0000000 --- a/Dockerfile-ocrd_cis +++ /dev/null @@ -1,19 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_CIS_VERSION="0.1.5" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ -# Now the real stuff: - "https://github.com/cisocrgroup/ocrd_cis/archive/v${OCRD_CIS_VERSION}.tar.gz" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-cis-ocropy-segment"] diff --git 
a/Dockerfile-ocrd_fileformat b/Dockerfile-ocrd_fileformat deleted file mode 100644 index 060f79c..0000000 --- a/Dockerfile-ocrd_fileformat +++ /dev/null @@ -1,24 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_FILEFORMAT_VERSION="0.5.0" - - -RUN apt-get update && \ - apt-get install -y \ - git \ - openjdk-11-jdk-headless \ - wget \ - unzip \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* -RUN git clone --depth 1 --branch v${OCRD_FILEFORMAT_VERSION} https://github.com/OCR-D/ocrd_fileformat.git && \ - cd ocrd_fileformat/ && \ - git submodule update --init && \ - make install-fileformat install PREFIX=/usr/local && \ - cd .. && rm -rf ocrd_fileformat/ - - -# Default command -CMD ['ocrd-fileformat-transform'] diff --git a/Dockerfile-ocrd_olena b/Dockerfile-ocrd_olena deleted file mode 100644 index 29be067..0000000 --- a/Dockerfile-ocrd_olena +++ /dev/null @@ -1,39 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_OLENA_VERSION="1.3.0" - - -# Build ocrd_olena -RUN apt-get update && \ - apt-get install -y \ - imagemagick \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - -# Install olena from .deb -RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1.0+ocrd-git+2-ubuntu22.04/olena-bin_2.1.0+ocrd-git+2_amd64.deb && \ - dpkg -i --force-depends olena-bin_2.1.0+ocrd-git+2_amd64.deb && \ - rm -f olena-bin_2.1.0+ocrd-git+2_amd64.deb && \ - apt-get update && \ - apt-get -f install -y && \ - apt-get clean && rm -rf /var/lib/apt/lists/* && \ - if ! 
scribo-cli sauvola --help >/dev/null 2>&1; then echo "Olena/scribo is not installed correctly" >&2; exit 1; fi -RUN curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ - mkdir ocrd_olena && \ - tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \ - cd ocrd_olena && \ - sed -i 's/^install: deps/install:/' Makefile && \ - ${PIP_INSTALL} ocrd && \ - apt install xmlstarlet && \ - make install PREFIX=/usr/local && \ - cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ['ocrd-olena-binarize'] diff --git a/Dockerfile-ocrd_segment b/Dockerfile-ocrd_segment deleted file mode 100644 index 284f45d..0000000 --- a/Dockerfile-ocrd_segment +++ /dev/null @@ -1,19 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_SEGMENT_VERSION="0.1.22" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ -# Now the real stuff: - "ocrd-segment == ${OCRD_SEGMENT_VERSION}" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-segment-extract-regions"] diff --git a/Dockerfile-ocrd_tesserocr b/Dockerfile-ocrd_tesserocr deleted file mode 100644 index c046cfc..0000000 --- a/Dockerfile-ocrd_tesserocr +++ /dev/null @@ -1,31 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG TESSDATA_BEST_VERSION="4.0.0" -ARG OCRD_TESSEROCR_VERSION="0.17.0" -ENV TESSDATA_PREFIX /usr/local/share/tessdata - - -# Install Leptonica and Tesseract. -# TODO: Review if alex-p's repo is still necessary on jammy (jammy has 4.1.1, -# alex-p has 4.1.3, but not for jammy.) 
-# RUN add-apt-repository ppa:alex-p/tesseract-ocr && \ -RUN apt-get update && \ - apt-get install -y \ - tesseract-ocr \ - libtesseract-dev \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "ocrd_tesserocr == ${OCRD_TESSEROCR_VERSION}" - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-tesserocr-recognize"] diff --git a/Dockerfile-ocrd_trocr b/Dockerfile-ocrd_trocr index fc05759..ef4fe15 100644 --- a/Dockerfile-ocrd_trocr +++ b/Dockerfile-ocrd_trocr @@ -1,8 +1,7 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT +FROM ocrd/all:maximum -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_TROCR_COMMIT="250ff1c" +ARG PIP_INSTALL="pip3 install --no-cache-dir" +ARG OCRD_TROCR_COMMIT="30696cb" # Build pip installable stuff diff --git a/Dockerfile-ocrd_wrap b/Dockerfile-ocrd_wrap deleted file mode 100644 index 518d306..0000000 --- a/Dockerfile-ocrd_wrap +++ /dev/null @@ -1,18 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_WRAP_VERSION="0.1.7" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - "ocrd_wrap == ${OCRD_WRAP_VERSION}" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-preprocess-image"] diff --git a/Dockerfile-sbb_binarization b/Dockerfile-sbb_binarization deleted file mode 100644 index a80aea0..0000000 --- a/Dockerfile-sbb_binarization +++ /dev/null @@ -1,19 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG SBB_BINARIZATION_VERSION="0.1.0" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ -# Now the real stuff: - "sbb_binarization == $SBB_BINARIZATION_VERSION" - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-sbb-binarize"] diff --git a/Dockerfile-sbb_textline_detector 
b/Dockerfile-sbb_textline_detector deleted file mode 100644 index 0569ab8..0000000 --- a/Dockerfile-sbb_textline_detector +++ /dev/null @@ -1,20 +0,0 @@ -ARG GIT_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG SBB_TEXTLINE_DETECTOR_COMMIT="c4df3d6" - - -# Build pip installable stuff -RUN ${PIP_INSTALL} \ - # https://github.com/qurator-spk/sbb_textline_detection/issues/50 - "h5py < 3" \ - https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz - - -# Check pip dependencies -RUN pip check - - -# Default command -CMD ["ocrd-sbb-textline-detector"] diff --git a/build b/build deleted file mode 100755 index 96835cb..0000000 --- a/build +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -set -e - -self=`realpath $0` -self_dir=`dirname "$self"` - - - -if [ -n "$1" ]; then - sub_images="" - for arg in "$@"; do - arg_sub_image=`echo "$arg" | sed 's/Dockerfile-//'` - NL=$'\n' - sub_images+="$NL$arg_sub_image" - done -else - sub_images=`ls -1 Dockerfile-core* | sed 's/Dockerfile-//'` - sub_images="$sub_images `ls -1 Dockerfile-* | sed 's/Dockerfile-//'`" -fi -echo "Building:" -echo "$sub_images" -echo - - -# Update base images if we build a core image -if echo "$sub_images" | grep -q core; then - docker pull ubuntu:22.04 - docker pull nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04 -fi - -for sub_image in $sub_images; do - docker build --cache-from=quratorspk/ocrd-galley-$sub_image -t quratorspk/ocrd-galley-$sub_image -f Dockerfile-$sub_image . -done diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..977e460 --- /dev/null +++ b/build.sh @@ -0,0 +1,4 @@ +#!/bin/sh +set -ex + +docker build . 
-t ocrd_trocr:latest -f Dockerfile-ocrd_trocr diff --git a/my_ocrd_workflow b/extra/my_ocrd_workflow similarity index 100% rename from my_ocrd_workflow rename to extra/my_ocrd_workflow diff --git a/my_ocrd_workflow-sbb b/extra/my_ocrd_workflow-sbb similarity index 100% rename from my_ocrd_workflow-sbb rename to extra/my_ocrd_workflow-sbb diff --git a/ocrd-workspace-from-images b/extra/ocrd-workspace-from-images similarity index 100% rename from ocrd-workspace-from-images rename to extra/ocrd-workspace-from-images diff --git a/ppn2ocr b/extra/ppn2ocr similarity index 100% rename from ppn2ocr rename to extra/ppn2ocr diff --git a/requirements-ppn2ocr.txt b/extra/requirements-ppn2ocr.txt similarity index 100% rename from requirements-ppn2ocr.txt rename to extra/requirements-ppn2ocr.txt diff --git a/zdb2ocr b/extra/zdb2ocr similarity index 100% rename from zdb2ocr rename to extra/zdb2ocr diff --git a/qurator_data_lib.sh b/qurator_data_lib.sh deleted file mode 100644 index 704d54d..0000000 --- a/qurator_data_lib.sh +++ /dev/null @@ -1,156 +0,0 @@ -# ______________________________________ -#/ always copy the file from \ -#| mono-repo/qurator_data_lib.sh, never | -#\ edit the copy in the project / -# -------------------------------------- -# \ ^__^ -# \ (oo)\_______ -# (__)\ )\/\ -# ||----w | -# || || - -if [ -z "$BASH" ]; then - echo "qurator_data_lib.sh uses bash features, please make sure to run $0 in bash" - exit 1 -fi - -check_data_subdir() { - result=0 - - if git submodule status $DATA_SUBDIR | grep -q '^-'; then - echo "$DATA_SUBDIR/ is not an initialized submodule"; result=1 - fi - if ! [ -e $DATA_SUBDIR/.git/annex ]; then - echo "$DATA_SUBDIR/ is not a git annex repository"; result=1 - fi - if ! (cd $DATA_SUBDIR && git annex version | egrep -q 'local repository version: (7|8)'); then - echo "$DATA_SUBDIR/ is not a git annex repository version 7 or 8"; result=1 - fi - if ! 
(cd $DATA_SUBDIR && git remote | grep -q '^nfs$'); then - echo "$DATA_SUBDIR/ has no git remote 'nfs'"; result=1 - fi - - return $result -} - -annex_get() { - if [[ "$1" = '--allow_symlinks' ]]; then - allow_symlinks=1 - shift - else - allow_symlinks=0 - fi - file_pattern="$1" - - ( - cd $DATA_SUBDIR - git annex get $file_pattern - - # fsck seems to be necessary to fix the files if we are in a submodule - git annex fsck $file_pattern - - # Check that there are no symlinks = only unlocked files. This is needed for - # Docker builds, as we cannot dereference symlinks in a Dockerfile COPY. - if [[ $allow_symlinks = 0 ]]; then - git ls-files $file_pattern | while read f; do - if ! [[ -f "$f" ]]; then - echo "$DATA_SUBDIR/$f is not a regular file – Is an unlock needed?" - exit - fi - done - fi - ) -} - -# Options: -# --no-unpack Do NOT unpack the file -# --strip-components NUMBER (as tar's option) -download_to() { - unpack=1 - tar_options="" - - _options=$(getopt --long no-unpack,strip-components: -- "" "$@") - if [[ $? 
!= 0 ]]; then - echo "Bad parameters for download_to" >&2 - exit 1 - fi - eval set -- "$_options" - while true; do - case "$1" in - --no-unpack) - unpack=0 - ;; - --strip-components) - shift - components=$1 - tar_options="$tar_options --strip-components $components" - ;; - --) - shift - break - ;; - esac - shift - done - - download_source="$1" - dest="$2" - - ( - cd $DATA_SUBDIR - tmpf=`mktemp 'tmp.XXXXXX'` - curl -sSL -o $tmpf "$download_source" - if [[ $unpack = 1 ]]; then - mkdir -p "$dest" - # Unpacking relies on tar -a unpacking any tar compression - tar -C "$dest" $tar_options -af $tmpf -xv - rm -f $tmpf - else - dest_dir=`dirname "$dest"` - mkdir -p "$dest_dir" - mv $tmpf "$dest" - fi - ) -} - -suggest_commands() { - echo "Suggested commands:" - echo - echo "git submodule update --init" - echo "(cd $DATA_SUBDIR && git annex init --version=7)" - echo "(cd $DATA_SUBDIR && git remote add nfs annex@b-lx0053.sbb.spk-berlin.de:/var/lib/annex/qurator-data.git)" -} - -handle_data() { - if [[ "$1" = '--no-download' ]]; then - no_download=1 - shift - else - no_download=0 - fi - - if [ -n "$FORCE_DOWNLOAD" ]; then - get_from_web - elif ! 
check_data_subdir; then - if [[ $no_download = 1 ]]; then - select choice in "Abort to manually fix $DATA_SUBDIR submodule"; do - if [ $REPLY = 1 ]; then - suggest_commands - exit - fi - done - else - select choice in "Abort to manually fix $DATA_SUBDIR submodule" "Download data files from the web"; do - if [ $REPLY = 1 ]; then - suggest_commands - exit - else - get_from_web - break - fi - done - fi - else - get_from_annex - fi -} diff --git a/test-core-cuda12.1.sh b/test-core-cuda12.1.sh deleted file mode 120000 index 2ee3591..0000000 --- a/test-core-cuda12.1.sh +++ /dev/null @@ -1 +0,0 @@ -test-core.sh \ No newline at end of file diff --git a/test-ocrd_cis.sh b/test-ocrd_cis.sh deleted file mode 100755 index 39afd91..0000000 --- a/test-ocrd_cis.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh -set -ex - -test_id=`basename $0` -cd `mktemp -d /tmp/$test_id-XXXXX` - -# Prepare test workspace -wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip -unzip actevedef_718448162.first-page+binarization+segmentation.zip -cd actevedef_718448162.first-page+binarization+segmentation - -# Run tests -ocrd-cis-ocropy-segment -I OCR-D-IMG-BIN -O TEST-CIS-OCRPY-SEGMENT -# TODO -recognize
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000..f588bd6
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+# Run every script matching tests/*.sh and print a pass/fail summary.
+# Exits with status 1 if at least one test script failed, 0 otherwise.
+set -e
+
+count_ok=0
+count_failed=0
+
+for test in tests/*.sh; do
+  echo "== $test"
+  # Record the exit status on both branches of the &&/|| list, so a failing
+  # test script does not trip `set -e` and abort the remaining tests.
+  "$test" && result=$? || result=$?
+
+  # Use POSIX `[` here: `[[` is a bash/ksh extension and this script runs as /bin/sh.
+  if [ "$result" = 0 ]; then
+    echo "✔"
+    count_ok=$((count_ok+1))
+  else
+    echo "❌"
+    count_failed=$((count_failed+1))
+  fi
+  echo
+done
+
+echo "$count_ok ok, $count_failed failed"
+if [ "$count_failed" -gt 0 ]; then
+  exit 1
+fi
diff --git a/test-core.sh b/tests/test-core.sh similarity index 100% rename from test-core.sh rename to tests/test-core.sh diff --git a/test-dinglehopper.sh b/tests/test-dinglehopper.sh similarity index 100% rename from test-dinglehopper.sh rename to tests/test-dinglehopper.sh diff --git a/test-eynollah.sh b/tests/test-eynollah.sh similarity index 100% rename from test-eynollah.sh rename to tests/test-eynollah.sh diff --git a/tests/test-ocrd_anybaseocr.sh b/tests/test-ocrd_anybaseocr.sh new file mode 100755 index 0000000..6d44615 --- /dev/null +++ b/tests/test-ocrd_anybaseocr.sh @@ -0,0 +1,16 @@ +#!/bin/sh +set -ex + +test_id=`basename $0` +cd `mktemp -d /tmp/$test_id-XXXXX` + +# Prepare processors + +# Prepare test workspace +wget https://qurator-data.de/examples/actevedef_718448162.first-page.zip +unzip actevedef_718448162.first-page.zip +cd actevedef_718448162.first-page + +# Run tests +ocrd-anybaseocr-binarize -I OCR-D-IMG -O OCR-D-BIN -P operation_level page -P threshold 0.3 +ocrd-anybaseocr-deskew -I OCR-D-BIN -O OCR-D-DESKEW -P maxskew 5.0 -P skewsteps 20 -P operation_level page diff --git a/test-sbb_binarization.sh b/tests/test-ocrd_calamari.sh similarity index 67% rename from test-sbb_binarization.sh rename to tests/test-ocrd_calamari.sh index cc0afa9..1726e26 100755 --- a/test-sbb_binarization.sh +++ b/tests/test-ocrd_calamari.sh @@ -5,7 +5,7 @@ test_id=`basename $0` cd `mktemp -d /tmp/$test_id-XXXXX` # Prepare processors -ocrd resmgr download ocrd-sbb-binarize default-2021-03-09 +ocrd resmgr download ocrd-calamari-recognize qurator-gt4histocr-1.0 # Prepare test workspace wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip @@ -13,4 +13,4 @@ unzip
actevedef_718448162.first-page+binarization+segmentation.zip cd actevedef_718448162.first-page+binarization+segmentation # Run tests -ocrd-sbb-binarize -P model default-2021-03-09 -I OCR-D-IMG -O TEST-OCRD-SBB-BINARIZE +ocrd-calamari-recognize -I OCR-D-SEG-LINE-SBB -O OCR-D-OCR-CALA -P checkpoint_dir qurator-gt4histocr-1.0 diff --git a/tests/test-ocrd_cis.sh b/tests/test-ocrd_cis.sh new file mode 100755 index 0000000..f998df2 --- /dev/null +++ b/tests/test-ocrd_cis.sh @@ -0,0 +1,30 @@ +#!/bin/sh +set -ex + +test_id=`basename $0` +cd `mktemp -d /tmp/$test_id-XXXXX` + +OCRD_CIS_OCROPY_MODEL=fraktur.pyrnn.gz + +# Prepare processors +ocrd resmgr download ocrd-cis-ocropy-recognize $OCRD_CIS_OCROPY_MODEL + +# Prepare test workspace +wget https://qurator-data.de/examples/actevedef_718448162.first-page.zip +unzip actevedef_718448162.first-page.zip +cd actevedef_718448162.first-page + +# XXX ocrd-cis-ocropy-segment wasn't happy with the binarized input (no +# "binarized" AlternativeImage?!), so we do it here again +ocrd-skimage-binarize -I OCR-D-IMG -O OCR-D-IMG-BIN + +# Run tests +ocrd-cis-ocropy-segment \ + -I OCR-D-IMG-BIN -O TEST-CIS-OCROPY-SEG-LINE \ + -P level-of-operation page +test "$(grep TextLine TEST-CIS-OCROPY-SEG-LINE/*.xml | wc -l)" -gt 50 + +ocrd-cis-ocropy-recognize \ + -I TEST-CIS-OCROPY-SEG-LINE -O TEST-CIS-OCROPY-OCR \ + -P model $OCRD_CIS_OCROPY_MODEL +test "$(grep Unicode TEST-CIS-OCROPY-OCR/*.xml | wc -l)" -gt 50 diff --git a/test-ocrd_fileformat.sh b/tests/test-ocrd_fileformat.sh similarity index 100% rename from test-ocrd_fileformat.sh rename to tests/test-ocrd_fileformat.sh diff --git a/test-ocrd_olena.sh b/tests/test-ocrd_olena.sh similarity index 100% rename from test-ocrd_olena.sh rename to tests/test-ocrd_olena.sh diff --git a/test-ocrd_segment.sh b/tests/test-ocrd_segment.sh similarity index 100% rename from test-ocrd_segment.sh rename to tests/test-ocrd_segment.sh diff --git a/test-ocrd_tesserocr.sh b/tests/test-ocrd_tesserocr.sh similarity 
index 84% rename from test-ocrd_tesserocr.sh rename to tests/test-ocrd_tesserocr.sh index 3cd403c..007698d 100755 --- a/test-ocrd_tesserocr.sh +++ b/tests/test-ocrd_tesserocr.sh @@ -5,6 +5,8 @@ test_id=`basename $0` cd `mktemp -d /tmp/$test_id-XXXXX` # Prepare processors +ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata +ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata ocrd resmgr download ocrd-tesserocr-recognize Fraktur_GT4HistOCR.traineddata # Prepare test workspace diff --git a/test-sbb_textline_detector.sh b/tests/test-ocrd_trocr.sh similarity index 72% rename from test-sbb_textline_detector.sh rename to tests/test-ocrd_trocr.sh index 9f940f1..68295a9 100755 --- a/test-sbb_textline_detector.sh +++ b/tests/test-ocrd_trocr.sh @@ -5,7 +5,6 @@ test_id=`basename $0` cd `mktemp -d /tmp/$test_id-XXXXX` # Prepare processors -ocrd resmgr download ocrd-sbb-textline-detector default # Prepare test workspace wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip @@ -13,4 +12,6 @@ unzip actevedef_718448162.first-page+binarization+segmentation.zip cd actevedef_718448162.first-page+binarization+segmentation # Run tests -ocrd-sbb-textline-detector -P models default -I OCR-D-IMG-BIN -O TEST-EYNOLLAH-SEG +ocrd-trocr-recognize -I OCR-D-SEG-LINE-SBB -O TEST-TROCR + +# TODO Does not use a useful model, does not check that text was recognized diff --git a/test-ocrd_wrap.sh b/tests/test-ocrd_wrap.sh similarity index 100% rename from test-ocrd_wrap.sh rename to tests/test-ocrd_wrap.sh diff --git a/test-ocrd_trocr.sh b/tests/test-sbb_binarization.sh similarity index 75% rename from test-ocrd_trocr.sh rename to tests/test-sbb_binarization.sh index d04be15..4ecfbbf 100755 --- a/test-ocrd_trocr.sh +++ b/tests/test-sbb_binarization.sh @@ -5,6 +5,7 @@ test_id=`basename $0` cd `mktemp -d /tmp/$test_id-XXXXX` # Prepare processors +ocrd resmgr download ocrd-sbb-binarize default # Prepare test workspace wget
https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip @@ -12,4 +13,4 @@ unzip actevedef_718448162.first-page+binarization+segmentation.zip cd actevedef_718448162.first-page+binarization+segmentation # Run tests -ocrd-trocr-recognize -I OCR-D-SEG-LINE-SBB -O TEST-TROCR +ocrd-sbb-binarize -P model default -I OCR-D-IMG -O TEST-OCRD-SBB-BINARIZE diff --git a/wrapper/qurator/ocrd_galley/cli.py b/wrapper/qurator/ocrd_galley/cli.py index 9423c61..a024a60 100644 --- a/wrapper/qurator/ocrd_galley/cli.py +++ b/wrapper/qurator/ocrd_galley/cli.py @@ -5,10 +5,9 @@ import colorama from pathlib import Path from termcolor import colored -from .sub_images import sub_images +from .processor_images import processor_images + -DOCKER_IMAGE_PREFIX = os.environ.get("DOCKER_IMAGE_PREFIX", "quratorspk/ocrd-galley") -DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "latest") LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") # xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler @@ -26,22 +25,23 @@ def main(): argv = sys.argv.copy() argv[0] = os.path.basename(argv[0]) + # If we're running ocrd resmgr download we need to run the correct subimage. 
if argv[:3] == ["ocrd", "resmgr", "download"] or \ argv[:3] == ["ocrd", "resmgr", "list-available"]: # Default to the base image - sub_image = sub_images[argv[0]] + processor_image = processor_images[argv[0]] # But look for a match of the executable for x in argv[3:]: - if x in sub_images: - sub_image = sub_images[x] + if x in processor_images: + processor_image = processor_images[x] break else: - sub_image = sub_images[argv[0]] + processor_image = processor_images[argv[0]] - docker_image = "%s-%s:%s" % (DOCKER_IMAGE_PREFIX, sub_image, DOCKER_IMAGE_TAG) + docker_image = processor_image - if DOCKER_IMAGE_TAG != "latest": + if docker_image != "ocrd/all:maximum": print(colored(f"Using {docker_image}", 'red')) docker_run(argv, docker_image) @@ -50,6 +50,7 @@ def docker_run(argv, docker_image): docker_run_options = [] docker_run_options.extend(["--rm", "-t"]) docker_run_options.extend(["--mount", "type=bind,src=%s,target=/data" % os.getcwd()]) + docker_run_options.extend(["--mount", "type=tmpfs,target=/tmp"]) docker_run_options.extend(["--user", "%s:%s" % (os.getuid(), os.getgid())]) docker_run_options.extend(["-e", "LOG_LEVEL=%s" % LOG_LEVEL]) docker_run_options.extend(["-e", "_OCRD_COMPLETE"]) diff --git a/wrapper/qurator/ocrd_galley/processor_images.py b/wrapper/qurator/ocrd_galley/processor_images.py new file mode 100644 index 0000000..da11bd0 --- /dev/null +++ b/wrapper/qurator/ocrd_galley/processor_images.py @@ -0,0 +1,45 @@ +processor_images = { + "ocrd": "ocrd/all:maximum", + "ocrd-olena-binarize": "ocrd/all:maximum", + "ocrd-sbb-binarize": "ocrd/all:maximum", + "ocrd-sbb-textline-detector": "ocrd/all:maximum", + "ocrd-calamari-recognize": "ocrd/all:maximum", + "ocrd-calamari-recognize03": "ocrd/all:maximum", + "ocrd-tesserocr-segment-region": "ocrd/all:maximum", + "ocrd-tesserocr-segment-line": "ocrd/all:maximum", + "ocrd-tesserocr-recognize": "ocrd/all:maximum", + "ocrd-dinglehopper": "ocrd/all:maximum", + "ocrd-cis-ocropy-clip": "ocrd/all:maximum", + 
"ocrd-cis-ocropy-resegment": "ocrd/all:maximum", + "ocrd-cis-ocropy-segment": "ocrd/all:maximum", + "ocrd-cis-ocropy-deskew": "ocrd/all:maximum", + "ocrd-cis-ocropy-denoise": "ocrd/all:maximum", + "ocrd-cis-ocropy-binarize": "ocrd/all:maximum", + "ocrd-cis-ocropy-dewarp": "ocrd/all:maximum", + "ocrd-cis-ocropy-recognize": "ocrd/all:maximum", + "ocrd-fileformat-transform": "ocrd/all:maximum", + "ocrd-segment-extract-pages": "ocrd/all:maximum", + "ocrd-segment-extract-regions": "ocrd/all:maximum", + "ocrd-segment-extract-lines": "ocrd/all:maximum", + "ocrd-segment-from-masks": "ocrd/all:maximum", + "ocrd-segment-from-coco": "ocrd/all:maximum", + "ocrd-segment-repair": "ocrd/all:maximum", + "ocrd-segment-evaluate": "ocrd/all:maximum", + "ocrd-preprocess-image": "ocrd/all:maximum", + "ocrd-skimage-normalize": "ocrd/all:maximum", + "ocrd-skimage-denoise-raw": "ocrd/all:maximum", + "ocrd-skimage-binarize": "ocrd/all:maximum", + "ocrd-skimage-denoise": "ocrd/all:maximum", + "ocrd-eynollah-segment": "ocrd/all:maximum", + "ocrd-anybaseocr-binarize": "ocrd/all:maximum", + "ocrd-anybaseocr-crop": "ocrd/all:maximum", + "ocrd-anybaseocr-deskew": "ocrd/all:maximum", + + # non OCR-D CLI + "ocr-transform": "ocrd/all:maximum", + "dinglehopper": "ocrd/all:maximum", + "dinglehopper-extract": "ocrd/all:maximum", + + # specialized images + "ocrd-trocr-recognize": "ocrd_trocr", +} diff --git a/wrapper/qurator/ocrd_galley/sub_images.py b/wrapper/qurator/ocrd_galley/sub_images.py deleted file mode 100644 index aaea945..0000000 --- a/wrapper/qurator/ocrd_galley/sub_images.py +++ /dev/null @@ -1,40 +0,0 @@ -sub_images = { - "ocrd": "core", - "ocrd-olena-binarize": "ocrd_olena", - "ocrd-sbb-binarize": "sbb_binarization", - "ocrd-sbb-textline-detector": "sbb_textline_detector", - "ocrd-calamari-recognize": "ocrd_calamari", - "ocrd-calamari-recognize03": "ocrd_calamari03", - "ocrd-tesserocr-segment-region": "ocrd_tesserocr", - "ocrd-tesserocr-segment-line": "ocrd_tesserocr", - 
"ocrd-tesserocr-recognize": "ocrd_tesserocr", - "ocrd-dinglehopper": "dinglehopper", - "ocrd-cis-ocropy-clip": "ocrd_cis", - "ocrd-cis-ocropy-resegment": "ocrd_cis", - "ocrd-cis-ocropy-segment": "ocrd_cis", - "ocrd-cis-ocropy-deskew": "ocrd_cis", - "ocrd-cis-ocropy-denoise": "ocrd_cis", - "ocrd-cis-ocropy-binarize": "ocrd_cis", - "ocrd-cis-ocropy-dewarp": "ocrd_cis", - "ocrd-cis-ocropy-recognize": "ocrd_cis", - "ocrd-fileformat-transform": "ocrd_fileformat", - "ocrd-segment-extract-pages": "ocrd_segment", - "ocrd-segment-extract-regions": "ocrd_segment", - "ocrd-segment-extract-lines": "ocrd_segment", - "ocrd-segment-from-masks": "ocrd_segment", - "ocrd-segment-from-coco": "ocrd_segment", - "ocrd-segment-repair": "ocrd_segment", - "ocrd-segment-evaluate": "ocrd_segment", - "ocrd-preprocess-image": "ocrd_wrap", - "ocrd-skimage-normalize": "ocrd_wrap", - "ocrd-skimage-denoise-raw": "ocrd_wrap", - "ocrd-skimage-binarize": "ocrd_wrap", - "ocrd-skimage-denoise": "ocrd_wrap", - "ocrd-eynollah-segment": "eynollah", - "ocrd-anybaseocr-crop": "ocrd_anybaseocr", - "ocrd-anybaseocr-deskew": "ocrd_anybaseocr", - "ocrd-trocr-recognize": "ocrd_trocr", - - # non OCR-D CLI - "ocr-transform": "ocrd_fileformat", -}