diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index e786a5d..0000000 --- a/Dockerfile +++ /dev/null @@ -1,86 +0,0 @@ -FROM ubuntu:18.04 - -ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 -ENV PIP_DEFAULT_TIMEOUT=120 - -ENV OCRD_OLENA_VERSION 1.2.0 -ENV TESSDATA_BEST_VERSION 4.0.0 -ENV TESSDATA_PREFIX /usr/local/share/tessdata - - -RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ - apt-get update && \ - apt-get install -y \ - curl xz-utils \ - python3-pip \ - git \ - software-properties-common \ -# For clstm on Ubuntu 19.04: - swig libeigen3-dev libpng-dev libprotobuf-dev \ -# For cv2: - libsm6 libxrender1 \ -# For ocrd_olena: - imagemagick \ -# XML utils - libxml2-utils \ - xmlstarlet \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - - -# Install Leptonica and Tesseract. -RUN add-apt-repository ppa:alex-p/tesseract-ocr && \ - apt-get update && \ - apt-get install -y \ - tesseract-ocr \ - libtesseract-dev \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - -# Set up OCR-D logging -COPY ocrd_logging.py /etc/ - - -# Build ocrd_olena -# XXX .deb needs an update -RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena-bin_2.1-0+ocrd-git_amd64.deb && \ - dpkg -i --force-depends olena-bin_2.1-0+ocrd-git_amd64.deb && \ - rm -f olena-bin_2.1-0+ocrd-git_amd64.deb && \ - apt-get update && \ - apt-get -f install -y && \ - apt-get clean && rm -rf /var/lib/apt/lists/* -RUN pip3 install --no-cache-dir --upgrade pip && \ - curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ - mkdir ocrd_olena && \ - tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \ - cd ocrd_olena && \ - sed -i 's/^install: deps$/install:/' Makefile && \ - pip3 install --no-cache-dir --use-feature=2020-resolver ocrd && \ - make install PREFIX=/usr/local && \ - cd .. 
&& rm -rf ocrd_olena ocrd_olena.tar.gz - - -# Copy OCR models -RUN mkdir -p /var/lib/calamari-models -COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR -RUN mkdir -p $TESSDATA_PREFIX -ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/ -COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/ -COPY data/textline_detection /var/lib/textline_detection - - -# Install requirements -# Using pipdeptree here to get more info than from pip3 check -COPY requirements.txt /tmp/ -RUN pip3 install --no-cache-dir --upgrade pip && \ - pip3 install --no-cache-dir --use-feature=2020-resolver -r /tmp/requirements.txt && \ - pip3 install --no-cache-dir pipdeptree && \ - pipdeptree -w fail - - -COPY my_ocrd_workflow /usr/bin/ -COPY xsd/* /usr/share/xml/ - - -WORKDIR /data -ENTRYPOINT ["/usr/bin/my_ocrd_workflow"] diff --git a/Dockerfile-boxed-base b/Dockerfile-boxed-base new file mode 100644 index 0000000..3fad690 --- /dev/null +++ b/Dockerfile-boxed-base @@ -0,0 +1,41 @@ +FROM ubuntu:18.04 + +ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 +ENV PIP_DEFAULT_TIMEOUT=120 + + +RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ + apt-get update && \ + apt-get install -y \ + curl xz-utils \ + python3-pip \ +# For add-apt-repository: + software-properties-common \ +# XML utils + libxml2-utils \ + xmlstarlet \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + +# Set up OCR-D logging +COPY ocrd_logging.py /etc/ + + +# Build pip installable stuff +RUN pip3 install --no-cache-dir --upgrade pip && \ + pip3 install --no-cache-dir \ +# Resolve conflicts early: + 'setuptools >= 41.0.0' \ + 'ocrd >= 2.13.1' + + +# Check pip dependencies +RUN pip3 check + + +WORKDIR /data + +# Default command (exec form is parsed as JSON, so double quotes are required) +CMD ["ocrd"] diff --git a/Dockerfile-boxed-dinglehopper b/Dockerfile-boxed-dinglehopper new file mode 100644 index 0000000..aa4749c --- /dev/null +++ 
b/Dockerfile-boxed-dinglehopper @@ -0,0 +1,18 @@ +FROM boxed-base + + +ENV DINGLEHOPPER_COMMIT=2b98f69 + + +# Build pip installable stuff +RUN pip3 install --no-cache-dir \ +# Now the real stuff: + https://github.com/qurator-spk/dinglehopper/archive/$DINGLEHOPPER_COMMIT.tar.gz + + +# Check pip dependencies +RUN pip3 check + + +# Default command +CMD ["ocrd-dinglehopper"] diff --git a/Dockerfile-boxed-ocrd_calamari b/Dockerfile-boxed-ocrd_calamari new file mode 100644 index 0000000..a64a0c1 --- /dev/null +++ b/Dockerfile-boxed-ocrd_calamari @@ -0,0 +1,24 @@ +FROM boxed-base + + +# Build pip installable stuff +RUN pip3 install --no-cache-dir \ +# Resolve conflicts early: + 'tensorflow-gpu == 1.15.*' \ + 'calamari-ocr == 0.3.5' \ +# Now the real stuff: + 'ocrd_calamari >= 0.0.7' + + +# Copy OCR models +RUN mkdir -p /var/lib/calamari-models +COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR + + + +# Check pip dependencies +RUN pip3 check + + +# Default command +CMD ["ocrd-calamari-recognize"] diff --git a/Dockerfile-boxed-ocrd_olena b/Dockerfile-boxed-ocrd_olena new file mode 100644 index 0000000..98f36bf --- /dev/null +++ b/Dockerfile-boxed-ocrd_olena @@ -0,0 +1,33 @@ +FROM boxed-base + +ENV OCRD_OLENA_VERSION=1.2.0 + +# Build ocrd_olena (curl -f: fail on HTTP errors instead of saving the error page) +RUN apt-get update && \ + apt-get install -y \ + imagemagick \ + && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +RUN curl -fsSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena-bin_2.1-0+ocrd-git_amd64.deb && \ + dpkg -i --force-depends olena-bin_2.1-0+ocrd-git_amd64.deb && \ + rm -f olena-bin_2.1-0+ocrd-git_amd64.deb && \ + apt-get update && \ + apt-get -f install -y && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +RUN pip3 install --no-cache-dir --upgrade pip && \ + curl -fsSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ + mkdir ocrd_olena && \ + tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz 
&& \ + cd ocrd_olena && \ + sed -i 's/^install: deps$/install:/' Makefile && \ + pip3 install --no-cache-dir --use-feature=2020-resolver ocrd && \ + make install PREFIX=/usr/local && \ + cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz + + +# Check pip dependencies +RUN pip3 check + + +# Default command (exec form is parsed as JSON, so double quotes are required) +CMD ["ocrd-olena-binarize"] diff --git a/Dockerfile-boxed-ocrd_tesserocr b/Dockerfile-boxed-ocrd_tesserocr new file mode 100644 index 0000000..c0ca4ff --- /dev/null +++ b/Dockerfile-boxed-ocrd_tesserocr @@ -0,0 +1,35 @@ +FROM boxed-base + + +ENV TESSDATA_BEST_VERSION=4.0.0 +ENV TESSDATA_PREFIX=/usr/local/share/tessdata + + +# Install Leptonica and Tesseract. +RUN add-apt-repository ppa:alex-p/tesseract-ocr && \ + apt-get update && \ + apt-get install -y \ + tesseract-ocr \ + libtesseract-dev \ + && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + + +# Copy OCR models +RUN mkdir -p $TESSDATA_PREFIX +ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/ +COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/ + + +# Build pip installable stuff +RUN pip3 install --no-cache-dir \ +# Now the real stuff: + 'ocrd_tesserocr >= 0.9.0' + + +# Check pip dependencies +RUN pip3 check + + +# Default command +CMD ["ocrd-tesserocr-recognize"] diff --git a/Dockerfile-boxed-sbb_textline_detector b/Dockerfile-boxed-sbb_textline_detector new file mode 100644 index 0000000..4274725 --- /dev/null +++ b/Dockerfile-boxed-sbb_textline_detector @@ -0,0 +1,22 @@ +FROM boxed-base + + +ENV SBB_TEXTLINE_DETECTOR_COMMIT=8b01d9e + + +# Build pip installable stuff +RUN pip3 install --no-cache-dir \ +# Now the real stuff: + https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz + + +# Copy OCR models +COPY data/textline_detection /var/lib/textline_detection + + +# Check pip dependencies +RUN pip3 check + + +# Default command +CMD ["ocrd-sbb-textline-detector"] 
diff --git a/build b/build index 227a914..3a50a4d 100755 --- a/build +++ b/build @@ -21,4 +21,9 @@ get_from_web() { handle_data -docker build --cache-from my_ocrd_workflow -t my_ocrd_workflow . +docker build -t boxed-base -f Dockerfile-boxed-base . +docker build -t boxed-ocrd_calamari -f Dockerfile-boxed-ocrd_calamari . +docker build -t boxed-dinglehopper -f Dockerfile-boxed-dinglehopper . +docker build -t boxed-ocrd_olena -f Dockerfile-boxed-ocrd_olena . +docker build -t boxed-ocrd_tesserocr -f Dockerfile-boxed-ocrd_tesserocr . +docker build -t boxed-sbb_textline_detector -f Dockerfile-boxed-sbb_textline_detector . diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index d97256c..0000000 --- a/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -tensorflow-gpu < 2.0 # Needed for sbb_text_linedetector - -ocrd >= 2.13.1 - -# XXX See https://github.com/OCR-D/ocrd_tesserocr/issues/135 -# ocrd_tesserocr >= 0.8.XXX -https://github.com/mikegerber/ocrd_tesserocr/archive/fix/set-pcgtsid.tar.gz - -ocrd_calamari >= 0.0.7 - -https://github.com/qurator-spk/sbb_textline_detector/archive/8b01d9e.tar.gz - -https://github.com/qurator-spk/dinglehopper/archive/2b98f69.tar.gz diff --git a/run b/run index 7fbf16d..5fb76c4 100755 --- a/run +++ b/run @@ -1,31 +1,42 @@ -#!/bin/sh -# Run the my_ocrd_workflow container on the current workspace +#!/bin/bash set -e # Abort on error -DOCKER_IMAGE=${DOCKER_IMAGE:-my_ocrd_workflow:latest} # default to locally built - -if echo "$DOCKER_IMAGE" | grep -q "/"; then - docker pull "$DOCKER_IMAGE" -fi +self=$(realpath "$0") +self_dir=$(dirname "$self") -# XXX Work around podman vs docker uid behaviour +# Docker run options +docker_run_options="--rm -t" +docker_run_options="$docker_run_options --mount type=bind,src=\"$(pwd)\",target=/data" +# In podman, the container always runs as the real user == uid 0 in container if docker -v 2>&1 | grep -q podman; then user="0:0" else user="`id -u`:`id -g`" fi - - -# The container currently needs to 
run privileged to allow it to read from e.g. +docker_run_options="$docker_run_options --user $user" +docker_run_options="$docker_run_options -e LOG_LEVEL=$LOG_LEVEL" +# The containers currently need to run privileged to allow them to read from e.g. # /home on SELinux secured systems such as Fedora. We might want to use udica # instead in the future. +docker_run_options="$docker_run_options --privileged=true" + + +# Build aliases for the containerized ocrd processors +build_alias() { + local command=$1 + local docker_image=$2 + + alias $command="docker run $docker_run_options $docker_image $command" +} +shopt -s expand_aliases # Required for non-interactive shells +build_alias ocrd boxed-base +build_alias ocrd-olena-binarize boxed-ocrd_olena +build_alias ocrd-sbb-textline-detector boxed-sbb_textline_detector +build_alias ocrd-calamari-recognize boxed-ocrd_calamari +build_alias ocrd-tesserocr-recognize boxed-ocrd_tesserocr +build_alias ocrd-dinglehopper boxed-dinglehopper + + -docker run --privileged=true --rm -t \ - \ - --user $user \ - --mount type=bind,src="$(pwd)",target=/data \ - \ - -e LOG_LEVEL=$LOG_LEVEL \ - $DOCKER_IMAGE "$@" +. "$self_dir/my_ocrd_workflow" diff --git a/run-docker-hub b/run-docker-hub deleted file mode 100755 index 9e4339e..0000000 --- a/run-docker-hub +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -# Run the my_ocrd_workflow container on the current workspace - -DOCKER_IMAGE=mikegerber/my_ocrd_workflow:stable `dirname $0`/run "$@"