Merge branch 'feat/boxed-processors'
commit
c5cd3f17e2
@ -1,86 +0,0 @@
|
||||
FROM ubuntu:18.04
|
||||
|
||||
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
|
||||
ENV PIP_DEFAULT_TIMEOUT=120
|
||||
|
||||
ENV OCRD_OLENA_VERSION 1.2.0
|
||||
ENV TESSDATA_BEST_VERSION 4.0.0
|
||||
ENV TESSDATA_PREFIX /usr/local/share/tessdata
|
||||
|
||||
|
||||
RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
|
||||
apt-get update && \
|
||||
apt-get install -y \
|
||||
curl xz-utils \
|
||||
python3-pip \
|
||||
git \
|
||||
software-properties-common \
|
||||
# For clstm on Ubuntu 19.04:
|
||||
swig libeigen3-dev libpng-dev libprotobuf-dev \
|
||||
# For cv2:
|
||||
libsm6 libxrender1 \
|
||||
# For ocrd_olena:
|
||||
imagemagick \
|
||||
# XML utils
|
||||
libxml2-utils \
|
||||
xmlstarlet \
|
||||
&& \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
# Install Leptonica and Tesseract.
|
||||
RUN add-apt-repository ppa:alex-p/tesseract-ocr && \
|
||||
apt-get update && \
|
||||
apt-get install -y \
|
||||
tesseract-ocr \
|
||||
libtesseract-dev \
|
||||
&& \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Set up OCR-D logging
|
||||
COPY ocrd_logging.py /etc/
|
||||
|
||||
|
||||
# Build ocrd_olena
|
||||
# XXX .deb needs an update
|
||||
RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena-bin_2.1-0+ocrd-git_amd64.deb && \
|
||||
dpkg -i --force-depends olena-bin_2.1-0+ocrd-git_amd64.deb && \
|
||||
rm -f olena-bin_2.1-0+ocrd-git_amd64.deb && \
|
||||
apt-get update && \
|
||||
apt-get -f install -y && \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
RUN pip3 install --no-cache-dir --upgrade pip && \
|
||||
curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \
|
||||
mkdir ocrd_olena && \
|
||||
tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \
|
||||
cd ocrd_olena && \
|
||||
sed -i 's/^install: deps$/install:/' Makefile && \
|
||||
pip3 install --no-cache-dir --use-feature=2020-resolver ocrd && \
|
||||
make install PREFIX=/usr/local && \
|
||||
cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz
|
||||
|
||||
|
||||
# Copy OCR models
|
||||
RUN mkdir -p /var/lib/calamari-models
|
||||
COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR
|
||||
RUN mkdir -p $TESSDATA_PREFIX
|
||||
ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/
|
||||
COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/
|
||||
COPY data/textline_detection /var/lib/textline_detection
|
||||
|
||||
|
||||
# Install requirements
|
||||
# Using pipdeptree here to get more info than from pip3 check
|
||||
COPY requirements.txt /tmp/
|
||||
RUN pip3 install --no-cache-dir --upgrade pip && \
|
||||
pip3 install --no-cache-dir --use-feature=2020-resolver -r /tmp/requirements.txt && \
|
||||
pip3 install --no-cache-dir pipdeptree && \
|
||||
pipdeptree -w fail
|
||||
|
||||
|
||||
COPY my_ocrd_workflow /usr/bin/
|
||||
COPY xsd/* /usr/share/xml/
|
||||
|
||||
|
||||
WORKDIR /data
|
||||
ENTRYPOINT ["/usr/bin/my_ocrd_workflow"]
|
@ -0,0 +1,49 @@
|
||||
FROM ubuntu:18.04
|
||||
|
||||
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
|
||||
ENV PIP_DEFAULT_TIMEOUT=120
|
||||
|
||||
|
||||
RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
|
||||
apt-get update && \
|
||||
apt-get install -y \
|
||||
curl xz-utils \
|
||||
build-essential python3-dev \
|
||||
# For get-pip.py:
|
||||
python3-distutils \
|
||||
# For add-apt-repository:
|
||||
software-properties-common \
|
||||
# XML utils
|
||||
libxml2-utils \
|
||||
xmlstarlet \
|
||||
&& \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
# Set up OCR-D logging
|
||||
RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py
|
||||
|
||||
|
||||
# Install pip (and setuptools)
|
||||
# We use get-pip.py here to avoid
|
||||
# a. having to upgrade from Ubuntu's pip
|
||||
# b. the dreaded "old script wrapper" error message
|
||||
# ubuntu:18.04 ships Python 3.6, which the generic get-pip.py no longer
# supports; fetch the version-specific bootstrap script instead.
# -f makes curl fail on HTTP errors instead of saving an error page that
# python3 would then try to execute; --retry 3 matches the other downloads.
RUN curl -fsSL --retry 3 https://bootstrap.pypa.io/pip/3.6/get-pip.py -o get-pip.py && \
    python3 get-pip.py && \
    rm -f get-pip.py
|
||||
|
||||
|
||||
# Install pip-installable packages
|
||||
RUN pip3 install --no-cache-dir \
|
||||
'ocrd >= 2.13.1'
|
||||
|
||||
|
||||
# Check pip dependencies
|
||||
RUN pip3 check
|
||||
|
||||
|
||||
WORKDIR /data
|
||||
|
||||
# Default command
|
||||
# Exec form must be a valid JSON array, i.e. use double quotes. With single
# quotes Docker falls back to shell form and tries to run "['ocrd']" via sh.
CMD ["ocrd"]
|
@ -0,0 +1,18 @@
|
||||
FROM my_ocrd_workflow-core
|
||||
|
||||
|
||||
# Commit of qurator-spk/dinglehopper that the pip install below fetches.
ENV DINGLEHOPPER_COMMIT=2b98f69
|
||||
|
||||
|
||||
# Build pip installable stuff
|
||||
RUN pip3 install --no-cache-dir \
|
||||
# Now the real stuff:
|
||||
https://github.com/qurator-spk/dinglehopper/archive/$DINGLEHOPPER_COMMIT.tar.gz
|
||||
|
||||
|
||||
# Check pip dependencies
|
||||
RUN pip3 check
|
||||
|
||||
|
||||
# Default command
|
||||
CMD ["ocrd-dinglehopper"]
|
@ -0,0 +1,24 @@
|
||||
FROM my_ocrd_workflow-core
|
||||
|
||||
|
||||
# Build pip installable stuff
|
||||
RUN pip3 install --no-cache-dir \
|
||||
# Resolve conflicts early:
|
||||
'tensorflow-gpu == 1.15.*' \
|
||||
'calamari-ocr == 0.3.5' \
|
||||
# Now the real stuff:
|
||||
'ocrd_calamari >= 0.0.7'
|
||||
|
||||
|
||||
# Copy OCR models
|
||||
RUN mkdir -p /var/lib/calamari-models
|
||||
COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR
|
||||
|
||||
|
||||
|
||||
# Check pip dependencies
|
||||
RUN pip3 check
|
||||
|
||||
|
||||
# Default command
|
||||
CMD ["ocrd-calamari-recognize"]
|
@ -0,0 +1,32 @@
|
||||
FROM my_ocrd_workflow-core
|
||||
|
||||
# Release tag (without the leading "v") of OCR-D/ocrd_olena to download and build.
ENV OCRD_OLENA_VERSION=1.2.0
|
||||
|
||||
# Build ocrd_olena
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
imagemagick \
|
||||
&& \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena-bin_2.1-0+ocrd-git_amd64.deb && \
|
||||
dpkg -i --force-depends olena-bin_2.1-0+ocrd-git_amd64.deb && \
|
||||
rm -f olena-bin_2.1-0+ocrd-git_amd64.deb && \
|
||||
apt-get update && \
|
||||
apt-get -f install -y && \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
RUN curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \
|
||||
mkdir ocrd_olena && \
|
||||
tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \
|
||||
cd ocrd_olena && \
|
||||
sed -i 's/^install: deps$/install:/' Makefile && \
|
||||
pip3 install --no-cache-dir --use-feature=2020-resolver ocrd && \
|
||||
make install PREFIX=/usr/local && \
|
||||
cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz
|
||||
|
||||
|
||||
# Check pip dependencies
|
||||
RUN pip3 check
|
||||
|
||||
|
||||
# Default command
|
||||
# Exec form must be a valid JSON array, i.e. use double quotes. With single
# quotes Docker falls back to shell form and the command cannot be found.
CMD ["ocrd-olena-binarize"]
|
@ -0,0 +1,35 @@
|
||||
FROM my_ocrd_workflow-core
|
||||
|
||||
|
||||
# TESSDATA_BEST_VERSION selects the repacked tessdata_best archive added below;
# TESSDATA_PREFIX is the directory the models are unpacked and copied into.
ENV TESSDATA_BEST_VERSION=4.0.0 \
    TESSDATA_PREFIX=/usr/local/share/tessdata
|
||||
|
||||
|
||||
# Install Leptonica and Tesseract.
|
||||
RUN add-apt-repository ppa:alex-p/tesseract-ocr && \
|
||||
apt-get update && \
|
||||
apt-get install -y \
|
||||
tesseract-ocr \
|
||||
libtesseract-dev \
|
||||
&& \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
# Copy OCR models
|
||||
RUN mkdir -p $TESSDATA_PREFIX
|
||||
ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/
|
||||
COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/
|
||||
|
||||
|
||||
# Build pip installable stuff
|
||||
RUN pip3 install --no-cache-dir \
|
||||
# Now the real stuff:
|
||||
'ocrd_tesserocr >= 0.9.0'
|
||||
|
||||
|
||||
# Check pip dependencies
|
||||
RUN pip3 check
|
||||
|
||||
|
||||
# Default command
|
||||
CMD ["ocrd-tesserocr-recognize"]
|
@ -0,0 +1,22 @@
|
||||
FROM my_ocrd_workflow-core
|
||||
|
||||
|
||||
# Commit of qurator-spk/sbb_textline_detector that the pip install below fetches.
ENV SBB_TEXTLINE_DETECTOR_COMMIT=8b01d9e
|
||||
|
||||
|
||||
# Build pip installable stuff
|
||||
RUN pip3 install --no-cache-dir \
|
||||
# Now the real stuff:
|
||||
https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz
|
||||
|
||||
|
||||
# Copy OCR models
|
||||
COPY data/textline_detection /var/lib/textline_detection
|
||||
|
||||
|
||||
# Check pip dependencies
|
||||
RUN pip3 check
|
||||
|
||||
|
||||
# Default command
|
||||
CMD ["ocrd-sbb-textline-detector"]
|
@ -1 +0,0 @@
|
||||
setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))
|
@ -1,13 +0,0 @@
|
||||
tensorflow-gpu < 2.0 # Needed for sbb_text_linedetector
|
||||
|
||||
ocrd >= 2.13.1
|
||||
|
||||
# XXX See https://github.com/OCR-D/ocrd_tesserocr/issues/135
|
||||
# ocrd_tesserocr >= 0.8.XXX
|
||||
https://github.com/mikegerber/ocrd_tesserocr/archive/fix/set-pcgtsid.tar.gz
|
||||
|
||||
ocrd_calamari >= 0.0.7
|
||||
|
||||
https://github.com/qurator-spk/sbb_textline_detector/archive/8b01d9e.tar.gz
|
||||
|
||||
https://github.com/qurator-spk/dinglehopper/archive/2b98f69.tar.gz
|
@ -1,31 +1,42 @@
|
||||
#!/bin/sh
|
||||
# Run the my_ocrd_workflow container on the current workspace
|
||||
#!/bin/bash
|
||||
|
||||
set -e # Abort on error
|
||||
|
||||
DOCKER_IMAGE=${DOCKER_IMAGE:-my_ocrd_workflow:latest} # default to locally built
|
||||
|
||||
if echo "$DOCKER_IMAGE" | grep -q "/"; then
|
||||
docker pull "$DOCKER_IMAGE"
|
||||
fi
|
||||
self=`realpath $0`
|
||||
self_dir=`dirname "$self"`
|
||||
|
||||
|
||||
# XXX Work around podman vs docker uid behaviour
|
||||
# Docker run options
|
||||
docker_run_options="--rm -t"
|
||||
docker_run_options="$docker_run_options --mount type=bind,src=\"$(pwd)\",target=/data"
|
||||
# In podman, the container always runs as the real user == uid 0 in container
|
||||
if docker -v 2>&1 | grep -q podman; then
|
||||
user="0:0"
|
||||
else
|
||||
user="`id -u`:`id -g`"
|
||||
fi
|
||||
|
||||
|
||||
# The container currently needs to run privileged to allow it to read from e.g.
|
||||
docker_run_options="$docker_run_options --user $user"
|
||||
docker_run_options="$docker_run_options -e LOG_LEVEL=$LOG_LEVEL"
|
||||
# The containers currently need to run privileged to allow them to read from e.g.
|
||||
# /home on SELinux secured systems such as Fedora. We might want to use udica
|
||||
# instead in the future.
|
||||
docker_run_options="$docker_run_options --privileged=true"
|
||||
|
||||
|
||||
# Build aliases for the containerized ocrd processors
|
||||
# Define a shell alias that runs a containerized OCR-D processor.
#   $1  command name; used both as the alias name and as the command
#       executed inside the container
#   $2  Docker image that provides the command
# Reads the global $docker_run_options for the "docker run" flags.
build_alias() {
  local command=$1
  local docker_image=$2

  # Quote the whole definition as one word so the alias name is not subject
  # to word splitting or globbing.
  alias "$command=docker run $docker_run_options $docker_image $command"
}
|
||||
shopt -s expand_aliases # Required for non-interactive shells
|
||||
build_alias ocrd my_ocrd_workflow-core
|
||||
build_alias ocrd-olena-binarize my_ocrd_workflow-ocrd_olena
|
||||
build_alias ocrd-sbb-textline-detector my_ocrd_workflow-sbb_textline_detector
|
||||
build_alias ocrd-calamari-recognize my_ocrd_workflow-ocrd_calamari
|
||||
build_alias ocrd-tesserocr-recognize my_ocrd_workflow-ocrd_tesserocr
|
||||
build_alias ocrd-dinglehopper my_ocrd_workflow-dinglehopper
|
||||
|
||||
|
||||
docker run --privileged=true --rm -t \
|
||||
\
|
||||
--user $user \
|
||||
--mount type=bind,src="$(pwd)",target=/data \
|
||||
\
|
||||
-e LOG_LEVEL=$LOG_LEVEL \
|
||||
$DOCKER_IMAGE "$@"
|
||||
. $self_dir/my_ocrd_workflow
|
||||
|
@ -1,4 +0,0 @@
|
||||
#!/bin/sh
|
||||
# Run the my_ocrd_workflow container on the current workspace
|
||||
|
||||
DOCKER_IMAGE=mikegerber/my_ocrd_workflow:stable `dirname $0`/run "$@"
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue