Merge branch 'feat/boxed-processors'

pull/27/head
Gerber, Mike 4 years ago
commit c5cd3f17e2

@ -7,7 +7,7 @@ git:
submodules: false # Avoid trying to checkout private data/ submodule
stages:
- name: "Build Docker image"
- name: "Build Docker images"
- name: "Test"
- name: "Deploy Docker image - latest"
if: branch = master
@ -16,24 +16,40 @@ stages:
jobs:
include:
- stage: "Build Docker image"
- stage: "Build Docker images"
script:
- sub_images=`ls -1 Dockerfile-* | sed 's/Dockerfile-//'`
- echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
# We are using the image my_ocrd_workflow to cache, so pull and tag it
- docker pull $DOCKER_USERNAME/my_ocrd_workflow
- docker tag $DOCKER_USERNAME/my_ocrd_workflow my_ocrd_workflow
# We are using the previously pushed images as a build cache, so pull and tag them
- |
for x in $sub_images; do
docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x || true
docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x my_ocrd_workflow-$x || true
done
- FORCE_DOWNLOAD=y ./build
- docker tag my_ocrd_workflow $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT
- |
set -e
for x in $sub_images; do
docker tag my_ocrd_workflow-$x $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT
done
- docker images
- docker push $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT
- |
set -e
for x in $sub_images; do
docker push $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT
done
- stage: "Test"
script:
- docker pull $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT
- docker tag $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT my_ocrd_workflow
- sub_images=`ls -1 Dockerfile-* | sed 's/Dockerfile-//'`
- |
for x in $sub_images; do
docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT
docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT my_ocrd_workflow-$x
done
- curl -O https://qurator-data.de/examples/actevedef_718448162.first-page.zip
- unzip actevedef_718448162.first-page.zip
@ -48,17 +64,25 @@ jobs:
- stage: "Deploy Docker image - latest"
env: DOCKER_TAG=latest
script:
- sub_images=`ls -1 Dockerfile-* | sed 's/Dockerfile-//'`
- echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
- docker pull $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT
- docker tag $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT $DOCKER_USERNAME/my_ocrd_workflow:$DOCKER_TAG
- docker push $DOCKER_USERNAME/my_ocrd_workflow:$DOCKER_TAG
- |
for x in $sub_images; do
docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT
docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG
docker push $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG
done
- stage: "Deploy Docker image - tagged"
env: DOCKER_TAG=$TRAVIS_TAG
script:
- sub_images=`ls -1 Dockerfile-* | sed 's/Dockerfile-//'`
- echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
- docker pull $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT
- docker tag $DOCKER_USERNAME/my_ocrd_workflow:$TRAVIS_COMMIT $DOCKER_USERNAME/my_ocrd_workflow:$DOCKER_TAG
- docker push $DOCKER_USERNAME/my_ocrd_workflow:$DOCKER_TAG
- |
for x in $sub_images; do
docker pull $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT
docker tag $DOCKER_USERNAME/my_ocrd_workflow-$x:$TRAVIS_COMMIT $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG
docker push $DOCKER_USERNAME/my_ocrd_workflow-$x:$DOCKER_TAG
done

@ -1,86 +0,0 @@
FROM ubuntu:18.04
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
ENV PIP_DEFAULT_TIMEOUT=120
ENV OCRD_OLENA_VERSION 1.2.0
ENV TESSDATA_BEST_VERSION 4.0.0
ENV TESSDATA_PREFIX /usr/local/share/tessdata
RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
apt-get update && \
apt-get install -y \
curl xz-utils \
python3-pip \
git \
software-properties-common \
# For clstm on Ubuntu 19.04:
swig libeigen3-dev libpng-dev libprotobuf-dev \
# For cv2:
libsm6 libxrender1 \
# For ocrd_olena:
imagemagick \
# XML utils
libxml2-utils \
xmlstarlet \
&& \
apt-get clean && rm -rf /var/lib/apt/lists/*
# Install Leptonica and Tesseract.
RUN add-apt-repository ppa:alex-p/tesseract-ocr && \
apt-get update && \
apt-get install -y \
tesseract-ocr \
libtesseract-dev \
&& \
apt-get clean && rm -rf /var/lib/apt/lists/*
# Set up OCR-D logging
COPY ocrd_logging.py /etc/
# Build ocrd_olena
# XXX .deb needs an update
RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena-bin_2.1-0+ocrd-git_amd64.deb && \
dpkg -i --force-depends olena-bin_2.1-0+ocrd-git_amd64.deb && \
rm -f olena-bin_2.1-0+ocrd-git_amd64.deb && \
apt-get update && \
apt-get -f install -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
RUN pip3 install --no-cache-dir --upgrade pip && \
curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \
mkdir ocrd_olena && \
tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \
cd ocrd_olena && \
sed -i 's/^install: deps$/install:/' Makefile && \
pip3 install --no-cache-dir --use-feature=2020-resolver ocrd && \
make install PREFIX=/usr/local && \
cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz
# Copy OCR models
RUN mkdir -p /var/lib/calamari-models
COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR
RUN mkdir -p $TESSDATA_PREFIX
ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/
COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/
COPY data/textline_detection /var/lib/textline_detection
# Install requirements
# Using pipdeptree here to get more info than from pip3 check
COPY requirements.txt /tmp/
RUN pip3 install --no-cache-dir --upgrade pip && \
pip3 install --no-cache-dir --use-feature=2020-resolver -r /tmp/requirements.txt && \
pip3 install --no-cache-dir pipdeptree && \
pipdeptree -w fail
COPY my_ocrd_workflow /usr/bin/
COPY xsd/* /usr/share/xml/
WORKDIR /data
ENTRYPOINT ["/usr/bin/my_ocrd_workflow"]

@ -0,0 +1,49 @@
# Core image: Ubuntu 18.04 with pip and the OCR-D core package ("ocrd")
# installed.
FROM ubuntu:18.04

ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
# pip network timeout in seconds
ENV PIP_DEFAULT_TIMEOUT=120

# Configure apt to retry failed downloads, install the base packages and
# drop the apt caches again to keep the layer small.
RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
    apt-get update && \
    apt-get install -y \
      curl xz-utils \
      build-essential python3-dev \
      # For get-pip.py:
      python3-distutils \
      # For add-apt-repository:
      software-properties-common \
      # XML utils
      libxml2-utils \
      xmlstarlet \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Set up OCR-D logging: the log level is taken from $LOG_LEVEL (default INFO)
RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py

# Install pip (and setuptools)
# We use get-pip.py here to avoid
# a. having to upgrade from Ubuntu's pip
# b. the dreaded "old script wrapper" error message
RUN curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
    python3 get-pip.py && \
    rm -f get-pip.py

# Install pip installable-stuff
RUN pip3 install --no-cache-dir \
      'ocrd >= 2.13.1'

# Check pip dependencies (fails the build on incompatible requirements)
RUN pip3 check

WORKDIR /data

# Default command
# The exec form must be a JSON array, i.e. it needs double quotes. With single
# quotes Docker falls back to the shell form and tries to run the literal
# string ['ocrd'] through /bin/sh, which fails.
CMD ["ocrd"]

@ -0,0 +1,18 @@
# dinglehopper image, layered on top of my_ocrd_workflow-core
FROM my_ocrd_workflow-core

# Commit of qurator-spk/dinglehopper to install
ENV DINGLEHOPPER_COMMIT=2b98f69

# Install dinglehopper from the GitHub archive of the pinned commit
RUN pip3 install --no-cache-dir \
      "https://github.com/qurator-spk/dinglehopper/archive/${DINGLEHOPPER_COMMIT}.tar.gz"

# Verify that the installed packages have compatible dependencies
RUN pip3 check

# Runs when the container is started without an explicit command
CMD ["ocrd-dinglehopper"]

@ -0,0 +1,24 @@
# ocrd_calamari image, layered on top of my_ocrd_workflow-core.
FROM my_ocrd_workflow-core
# Install the pip-installable packages; tensorflow-gpu and calamari-ocr are
# pinned explicitly before ocrd_calamari itself is requested.
RUN pip3 install --no-cache-dir \
# Resolve conflicts early:
'tensorflow-gpu == 1.15.*' \
'calamari-ocr == 0.3.5' \
# Now the real stuff:
'ocrd_calamari >= 0.0.7'
# Copy the GT4HistOCR Calamari models from the build context (data/) into
# the image
RUN mkdir -p /var/lib/calamari-models
COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR
# Check pip dependencies; a non-zero exit of "pip3 check" fails the build
RUN pip3 check
# Default command when the container is started without arguments
CMD ["ocrd-calamari-recognize"]

@ -0,0 +1,32 @@
# ocrd_olena image, layered on top of my_ocrd_workflow-core
FROM my_ocrd_workflow-core

# Release of OCR-D/ocrd_olena to install
ENV OCRD_OLENA_VERSION 1.2.0

# Build ocrd_olena
# imagemagick is installed first via apt
RUN apt-get update && \
    apt-get install -y \
      imagemagick \
    && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Install the prebuilt olena .deb. dpkg is told to ignore missing
# dependencies; "apt-get -f install" then pulls them in afterwards.
RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena-bin_2.1-0+ocrd-git_amd64.deb && \
    dpkg -i --force-depends olena-bin_2.1-0+ocrd-git_amd64.deb && \
    rm -f olena-bin_2.1-0+ocrd-git_amd64.deb && \
    apt-get update && \
    apt-get -f install -y && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Fetch the ocrd_olena release tarball and install it. The sed call removes
# the "deps" prerequisite from the Makefile's install target; the ocrd
# package is installed explicitly with pip instead.
RUN curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \
    mkdir ocrd_olena && \
    tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \
    cd ocrd_olena && \
    sed -i 's/^install: deps$/install:/' Makefile && \
    pip3 install --no-cache-dir --use-feature=2020-resolver ocrd && \
    make install PREFIX=/usr/local && \
    cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz

# Check pip dependencies (fails the build on incompatible requirements)
RUN pip3 check

# Default command
# The exec form must be a JSON array, i.e. it needs double quotes. With single
# quotes Docker falls back to the shell form and tries to run the literal
# string ['ocrd-olena-binarize'] through /bin/sh, which fails.
CMD ["ocrd-olena-binarize"]

@ -0,0 +1,35 @@
# ocrd_tesserocr image, layered on top of my_ocrd_workflow-core
FROM my_ocrd_workflow-core

# Version of the tessdata_best archive to unpack, and the directory the
# model files are placed in
ENV TESSDATA_BEST_VERSION=4.0.0
ENV TESSDATA_PREFIX=/usr/local/share/tessdata

# Install Leptonica and Tesseract from the alex-p PPA
RUN add-apt-repository ppa:alex-p/tesseract-ocr && \
    apt-get update && \
    apt-get install -y \
      tesseract-ocr \
      libtesseract-dev \
    && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Provide the OCR models: the repacked tessdata_best archive plus the
# GT4HistOCR model, both taken from the build context's data/ directory
RUN mkdir -p "${TESSDATA_PREFIX}"
ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz ${TESSDATA_PREFIX}/
COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata ${TESSDATA_PREFIX}/

# Install the processor itself
RUN pip3 install --no-cache-dir \
      'ocrd_tesserocr >= 0.9.0'

# Verify that the installed packages have compatible dependencies
RUN pip3 check

# Runs when the container is started without an explicit command
CMD ["ocrd-tesserocr-recognize"]

@ -0,0 +1,22 @@
# sbb_textline_detector image, layered on top of my_ocrd_workflow-core
FROM my_ocrd_workflow-core

# Commit of qurator-spk/sbb_textline_detector to install
ENV SBB_TEXTLINE_DETECTOR_COMMIT=8b01d9e

# Install the detector from the GitHub archive of the pinned commit
RUN pip3 install --no-cache-dir \
      "https://github.com/qurator-spk/sbb_textline_detector/archive/${SBB_TEXTLINE_DETECTOR_COMMIT}.tar.gz"

# Provide the textline detection models from the build context's data/ directory
COPY data/textline_detection /var/lib/textline_detection

# Verify that the installed packages have compatible dependencies
RUN pip3 check

# Runs when the container is started without an explicit command
CMD ["ocrd-sbb-textline-detector"]

@ -21,4 +21,9 @@ get_from_web() {
handle_data
docker build --cache-from my_ocrd_workflow -t my_ocrd_workflow .
docker build -t my_ocrd_workflow-core -f Dockerfile-core .
docker build -t my_ocrd_workflow-ocrd_calamari -f Dockerfile-ocrd_calamari .
docker build -t my_ocrd_workflow-dinglehopper -f Dockerfile-dinglehopper .
docker build -t my_ocrd_workflow-ocrd_olena -f Dockerfile-ocrd_olena .
docker build -t my_ocrd_workflow-ocrd_tesserocr -f Dockerfile-ocrd_tesserocr .
docker build -t my_ocrd_workflow-sbb_textline_detector -f Dockerfile-sbb_textline_detector .

@ -82,7 +82,7 @@ main() {
if [ "$LOG_LEVEL" = "DEBUG" -o "$LOG_LEVEL" = "TRACE" ]; then
pip3 list
pip3 list || true
fi
main

@ -1 +0,0 @@
setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))

@ -1,13 +0,0 @@
tensorflow-gpu < 2.0 # Needed for sbb_text_linedetector
ocrd >= 2.13.1
# XXX See https://github.com/OCR-D/ocrd_tesserocr/issues/135
# ocrd_tesserocr >= 0.8.XXX
https://github.com/mikegerber/ocrd_tesserocr/archive/fix/set-pcgtsid.tar.gz
ocrd_calamari >= 0.0.7
https://github.com/qurator-spk/sbb_textline_detector/archive/8b01d9e.tar.gz
https://github.com/qurator-spk/dinglehopper/archive/2b98f69.tar.gz

47
run

@ -1,31 +1,42 @@
#!/bin/sh
# Run the my_ocrd_workflow container on the current workspace
#!/bin/bash
set -e # Abort on error
DOCKER_IMAGE=${DOCKER_IMAGE:-my_ocrd_workflow:latest} # default to locally built
if echo "$DOCKER_IMAGE" | grep -q "/"; then
docker pull "$DOCKER_IMAGE"
fi
self=`realpath $0`
self_dir=`dirname "$self"`
# XXX Work around podman vs docker uid behaviour
# Docker run options
docker_run_options="--rm -t"
docker_run_options="$docker_run_options --mount type=bind,src=\"$(pwd)\",target=/data"
# In podman, the container always runs as the real user == uid 0 in container
if docker -v 2>&1 | grep -q podman; then
user="0:0"
else
user="`id -u`:`id -g`"
fi
# The container currently needs to run privileged to allow it to read from e.g.
docker_run_options="$docker_run_options --user $user"
docker_run_options="$docker_run_options -e LOG_LEVEL=$LOG_LEVEL"
# The containers currently need to run privileged to allow them to read from e.g.
# /home on SELinux secured systems such as Fedora. We might want to use udica
# instead in the future.
docker_run_options="$docker_run_options --privileged=true"
# Build aliases for the containerized ocrd processors
build_alias() {
local command=$1
local docker_image=$2
alias $command="docker run $docker_run_options $docker_image $command"
}
shopt -s expand_aliases # Required for non-interactive shells
build_alias ocrd my_ocrd_workflow-core
build_alias ocrd-olena-binarize my_ocrd_workflow-ocrd_olena
build_alias ocrd-sbb-textline-detector my_ocrd_workflow-sbb_textline_detector
build_alias ocrd-calamari-recognize my_ocrd_workflow-ocrd_calamari
build_alias ocrd-tesserocr-recognize my_ocrd_workflow-ocrd_tesserocr
build_alias ocrd-dinglehopper my_ocrd_workflow-dinglehopper
docker run --privileged=true --rm -t \
\
--user $user \
--mount type=bind,src="$(pwd)",target=/data \
\
-e LOG_LEVEL=$LOG_LEVEL \
$DOCKER_IMAGE "$@"
. $self_dir/my_ocrd_workflow

@ -1,4 +0,0 @@
#!/bin/sh
# Run the my_ocrd_workflow container on the current workspace
DOCKER_IMAGE=mikegerber/my_ocrd_workflow:stable `dirname $0`/run "$@"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save