mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-06-10 07:09:52 +02:00
Merge branch 'master' of https://github.com/qurator-spk/ocrd-galley
This commit is contained in:
commit
4c1f198da9
39 changed files with 532 additions and 383 deletions
74
.drone.star
74
.drone.star
|
@ -1,74 +0,0 @@
|
||||||
def main(ctx):
    """Return the Drone pipeline definition for the current build.

    The tag list always starts with the commit hash of the build.

    * Tag events yield a pipeline named "release".
    * Builds on the "master" branch yield a pipeline named "master" and
      additionally add the "latest" tag.
    * Any other build hits the bare ``return`` and yields None.

    Args:
        ctx: Drone build context; only ``ctx.build.commit``,
            ``ctx.build.event`` and ``ctx.build.branch`` are read here.

    Returns:
        A one-element list containing the pipeline dict, or None.
    """
    tags = [ctx.build.commit]

    if ctx.build.event == "tag":
        name = "release"
    elif ctx.build.branch == "master":
        name = "master"
        tags.append("latest")
    else:
        return

    return [
        {
            "kind": "pipeline",
            "name": name,
            "steps": [
                {
                    "name": "prepare data",
                    "image": "alpine",
                    "commands": [
                        "apk update && apk add bash curl",
                        # NOTE(review): "build-tmp-XXX" looks like a
                        # placeholder script name -- confirm.
                        "FORCE_DOWNLOAD=y ./build-tmp-XXX"
                    ]
                },
                # We can't glob and have to add here manually...
                step_for(ctx, "core", tags),
                step_for(ctx, "core-cuda10.0", tags),
                step_for(ctx, "core-cuda10.1", tags),

                step_for(ctx, "dinglehopper", tags),
                step_for(ctx, "ocrd_calamari", tags),
                step_for(ctx, "ocrd_calamari03", tags),
                step_for(ctx, "ocrd_cis", tags),
                step_for(ctx, "ocrd_fileformat", tags),
                step_for(ctx, "ocrd_olena", tags),
                step_for(ctx, "ocrd_segment", tags),
                step_for(ctx, "ocrd_tesserocr", tags),
                step_for(ctx, "ocrd_wrap", tags),
                step_for(ctx, "sbb_binarization", tags),
                step_for(ctx, "sbb_textline_detector", tags),
                step_for(ctx, "eynollah", tags),
                step_for(ctx, "ocrd_anybaseocr", tags),
                {
                    # Mail notification, sent on success as well as failure.
                    "name": "notify",
                    "image": "drillster/drone-email",
                    "settings": {
                        "host": "172.17.0.1",
                        "port": "25",
                        "from": "drone@ci.moegen-wir.net",
                    },
                    "when": {
                        "status": [ "success", "failure" ]
                    }
                }
            ]
        }
    ]
|
|
||||||
|
|
||||||
|
|
||||||
def step_for(ctx, sub_image, tags):
    """Return the Drone step that builds one sub-image with plugins/docker.

    Args:
        ctx: Drone build context; only ``ctx.build.commit`` is read and
            passed on as the ``DRONE_COMMIT`` build argument.
        sub_image: Sub-image name; used as the suffix of the step name, the
            repository ("quratorspk/ocrd-galley-<sub_image>") and the
            Dockerfile ("Dockerfile-<sub_image>").
        tags: List of image tags; stored in the step settings as-is
            (the same list object, not a copy).

    Returns:
        The step dict with "name", "image" and "settings" keys.
    """
    return {
        "name": "build %s" % sub_image,
        "image": "plugins/docker",
        "settings": {
            "build_args": [
                "DRONE_COMMIT=%s" % ctx.build.commit,
            ],
            "tags": tags,
            # Registry credentials are referenced by secret name only.
            "username": { "from_secret": "docker_username" },
            "password": { "from_secret": "docker_password" },
            "repo": "quratorspk/ocrd-galley-%s" % sub_image,
            "dockerfile": "Dockerfile-%s" % sub_image,
        }
    }
|
|
38
.github/list-subimages
vendored
Executable file
38
.github/list-subimages
vendored
Executable file
|
@ -0,0 +1,38 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
|
import glob
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
# Each "Dockerfile-<name>" found in the current working directory defines one
# sub-image called <name>.
all_subimages = {re.sub(r"^Dockerfile-", "", dockerfile) for dockerfile in glob.glob("Dockerfile-*")}
# Sub-images whose name starts with "core" are the core images; everything
# else is "rest".
core_subimages = {si for si in all_subimages if si.startswith("core")}
rest_subimages = all_subimages - core_subimages
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Command-line interface: --core and --rest select which group(s) to list,
# --json switches the output from one name per line to a JSON array.
parser = argparse.ArgumentParser(description='List subimages.')
parser.add_argument('--core', action='store_true',
                    default=False, help='List core subimages')
parser.add_argument('--rest', action='store_true',
                    default=False, help='List rest subimages')
parser.add_argument('--json', action='store_true',
                    default=False, help='Return list as JSON')
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def list_(subimages):
    """Print the given sub-image names in sorted order.

    With ``--json`` (read from the module-level ``args``) the names are
    printed as a single JSON array; otherwise one name per line.

    Args:
        subimages: Iterable of sub-image names.
    """
    subimages = sorted(subimages)
    if args.json:
        print(json.dumps(subimages))
    else:
        print("\n".join(subimages))
|
||||||
|
|
||||||
|
|
||||||
|
# Without a selector, list everything in one go. With selectors, each selected
# group is listed by its own list_() call (core first, then rest).
if not args.core and not args.rest:
    list_(core_subimages | rest_subimages)
if args.core:
    list_(core_subimages)
if args.rest:
    list_(rest_subimages)
|
58
.github/workflows/build-subimage.yml
vendored
Normal file
58
.github/workflows/build-subimage.yml
vendored
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
on:
|
||||||
|
workflow_call:
|
||||||
|
inputs:
|
||||||
|
subimage:
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
tags:
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
secrets:
|
||||||
|
DOCKERHUB_USERNAME:
|
||||||
|
required: true
|
||||||
|
DOCKERHUB_TOKEN:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build-subimage-job:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
-
|
||||||
|
name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
# We are checking out explicitly, so build-push-action isn't trying
|
||||||
|
# to checkout the (unreachable) submodule. (Using "context" there.)
|
||||||
|
-
|
||||||
|
name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v2
|
||||||
|
-
|
||||||
|
name: Docker meta
|
||||||
|
id: meta
|
||||||
|
uses: docker/metadata-action@v4
|
||||||
|
with:
|
||||||
|
images: |
|
||||||
|
quratorspk/ocrd-galley-${{ inputs.subimage }}
|
||||||
|
flavor: |
|
||||||
|
latest=auto
|
||||||
|
# latest=auto should generate "latest" for the type=semver tags entry
|
||||||
|
tags: ${{ inputs.tags }}
|
||||||
|
-
|
||||||
|
name: Login to Docker Hub
|
||||||
|
uses: docker/login-action@v2
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||||
|
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||||
|
-
|
||||||
|
name: Build ${{ inputs.subimage }}
|
||||||
|
uses: docker/build-push-action@v4
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: Dockerfile-${{ inputs.subimage }}
|
||||||
|
build-args: |
|
||||||
|
GIT_COMMIT=sha-${{ github.sha }}
|
||||||
|
BUILDKIT_INLINE_CACHE=1
|
||||||
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
|
push: true
|
||||||
|
|
||||||
|
cache-from: quratorspk/ocrd-galley-${{ inputs.subimage }}:sha-${{ github.sha }}
|
102
.github/workflows/build.yml
vendored
102
.github/workflows/build.yml
vendored
|
@ -1,36 +1,104 @@
|
||||||
name: build
|
name: build
|
||||||
|
|
||||||
on:
|
on:
|
||||||
|
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- 'master'
|
- 'master'
|
||||||
- 'test/github-actions'
|
- 'fix/*'
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
docker:
|
matrix:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
outputs:
|
||||||
|
core: ${{ steps.step1.outputs.core }}
|
||||||
|
rest: ${{ steps.step1.outputs.rest }}
|
||||||
|
all: ${{ steps.step1.outputs.all }}
|
||||||
steps:
|
steps:
|
||||||
-
|
-
|
||||||
name: Checkout
|
name: Checkout
|
||||||
uses: actions/checkout@v3
|
uses: actions/checkout@v3
|
||||||
# We are checking out explicitly, so build-push-action isn't trying
|
|
||||||
# to checkout the (unreachable) submodule. (Using "context" there.)
|
|
||||||
|
|
||||||
-
|
-
|
||||||
name: Set up Docker Buildx
|
name: Generate outputs
|
||||||
uses: docker/setup-buildx-action@v2
|
id: step1
|
||||||
|
run: |
|
||||||
|
echo "core=$(./.github/list-subimages --core --json)" >>$GITHUB_OUTPUT
|
||||||
|
echo "rest=$(./.github/list-subimages --rest --json)" >>$GITHUB_OUTPUT
|
||||||
|
echo "all=$(./.github/list-subimages --json)" >>$GITHUB_OUTPUT
|
||||||
|
|
||||||
# TODO data
|
echo "GITHUB_OUTPUT:"
|
||||||
# TODO matrix for all Dockerfiles
|
cat $GITHUB_OUTPUT
|
||||||
|
|
||||||
-
|
build-core:
|
||||||
name: Build
|
needs: matrix
|
||||||
uses: docker/build-push-action@v4
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
subimage: ${{ fromJson(needs.matrix.outputs.core) }}
|
||||||
|
uses: ./.github/workflows/build-subimage.yml
|
||||||
with:
|
with:
|
||||||
context: .
|
subimage: ${{ matrix.subimage }}
|
||||||
file: Dockerfile-core
|
tags: |
|
||||||
build-args:
|
type=sha,format=long
|
||||||
DRONE_COMMIT=${{ github.sha }}
|
# Here: NOT the full tags, just the sha! (they get added below)
|
||||||
push: false
|
secrets: inherit
|
||||||
|
|
||||||
|
build-rest:
|
||||||
|
needs: [matrix, build-core]
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
subimage: ${{ fromJson(needs.matrix.outputs.rest) }}
|
||||||
|
uses: ./.github/workflows/build-subimage.yml
|
||||||
|
with:
|
||||||
|
subimage: ${{ matrix.subimage }}
|
||||||
|
tags: |
|
||||||
|
type=sha,format=long
|
||||||
|
secrets: inherit
|
||||||
|
|
||||||
|
|
||||||
|
test:
|
||||||
|
needs: build-rest
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
env:
|
||||||
|
DOCKER_IMAGE_TAG: sha-${{ github.sha }} # needed to run the correct version through the wrapper
|
||||||
|
steps:
|
||||||
|
-
|
||||||
|
name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
-
|
||||||
|
name: Install wrapper
|
||||||
|
run: |
|
||||||
|
sudo apt-get install -y python3-pip
|
||||||
|
cd wrapper && pip install .
|
||||||
|
-
|
||||||
|
name: Test
|
||||||
|
run: |
|
||||||
|
ocrd --version
|
||||||
|
ocrd-dinglehopper --version
|
||||||
|
|
||||||
|
|
||||||
|
# At this point, we have successfully built, uploaded and tested the images. We now just need to add
|
||||||
|
# tags. We do this by building again, but using the formerly built images to
|
||||||
|
# cache from.
|
||||||
|
|
||||||
|
push-with-tags:
|
||||||
|
needs: [matrix, test]
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
subimage: ${{ fromJson(needs.matrix.outputs.all) }}
|
||||||
|
uses: ./.github/workflows/build-subimage.yml
|
||||||
|
with:
|
||||||
|
subimage: ${{ matrix.subimage }}
|
||||||
|
tags: |
|
||||||
|
type=sha,format=long
|
||||||
|
type=edge,branch=master
|
||||||
|
type=ref,event=branch
|
||||||
|
type=semver,pattern={{version}}
|
||||||
|
# Here: full tags
|
||||||
|
# Note: Do NOT use event=tag here, unless re-configuring the "latest"
|
||||||
|
# behavior too as that triggers on event=tag by default. By default,
|
||||||
|
# "latest" triggers on type=semver here, too (which is wanted).
|
||||||
|
secrets: inherit
|
||||||
|
|
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,3 +1,5 @@
|
||||||
|
build/
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
|
|
3
.gitmodules
vendored
3
.gitmodules
vendored
|
@ -1,3 +0,0 @@
|
||||||
[submodule "data"]
|
|
||||||
path = data
|
|
||||||
url = git@code.dev.sbb.berlin:qurator/qurator-data.git
|
|
|
@ -1,7 +1,7 @@
|
||||||
FROM ubuntu:18.04
|
FROM ubuntu:22.04
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG OCRD_VERSION_MINIMUM="2.23.3"
|
ARG OCRD_VERSION_MINIMUM="2.47.0"
|
||||||
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
|
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
|
||||||
ENV PIP_DEFAULT_TIMEOUT=120
|
ENV PIP_DEFAULT_TIMEOUT=120
|
||||||
|
|
||||||
|
@ -9,10 +9,11 @@ ENV PIP_DEFAULT_TIMEOUT=120
|
||||||
RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
|
RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
|
||||||
apt-get update && \
|
apt-get update && \
|
||||||
apt-get install -y \
|
apt-get install -y \
|
||||||
curl xz-utils \
|
build-essential \
|
||||||
build-essential python3-dev \
|
curl \
|
||||||
# For get-pip.py:
|
git \
|
||||||
python3-distutils \
|
xz-utils \
|
||||||
|
pkg-config \
|
||||||
# For add-apt-repository:
|
# For add-apt-repository:
|
||||||
software-properties-common \
|
software-properties-common \
|
||||||
# XML utils
|
# XML utils
|
||||||
|
@ -20,6 +21,17 @@ RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
|
||||||
xmlstarlet \
|
xmlstarlet \
|
||||||
# OCR-D uses ImageMagick for pixel density estimation
|
# OCR-D uses ImageMagick for pixel density estimation
|
||||||
imagemagick \
|
imagemagick \
|
||||||
|
# pyenv builds
|
||||||
|
# TODO: builder container?
|
||||||
|
libz-dev \
|
||||||
|
libssl-dev \
|
||||||
|
libbz2-dev \
|
||||||
|
liblzma-dev \
|
||||||
|
libncurses-dev \
|
||||||
|
libffi-dev \
|
||||||
|
libreadline-dev \
|
||||||
|
libsqlite3-dev \
|
||||||
|
libmagic-dev \
|
||||||
&& \
|
&& \
|
||||||
apt-get clean && \
|
apt-get clean && \
|
||||||
rm -rf /var/lib/apt/lists/*
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
@ -29,14 +41,19 @@ RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
|
||||||
RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py
|
RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py
|
||||||
|
|
||||||
|
|
||||||
# Install pip (and setuptools)
|
# Install pyenv
|
||||||
# We use get-pip.py here to avoid
|
# TODO: do not run as root
|
||||||
# a. having to upgrade from Ubuntu's pip
|
# TODO: does just saying "3.7" work as intended?
|
||||||
# b. the dreaded "old script wrapper" error message
|
ENV HOME=/root
|
||||||
RUN curl -sSL https://bootstrap.pypa.io/pip/3.6/get-pip.py -o get-pip.py && \
|
ENV PYENV_ROOT=/usr/local/share/pyenv
|
||||||
python3 get-pip.py && \
|
ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
|
||||||
rm -f get-pip.py
|
RUN \
|
||||||
|
git clone --depth=1 https://github.com/yyuu/pyenv.git $PYENV_ROOT && \
|
||||||
|
pyenv install 3.7 && \
|
||||||
|
pyenv global 3.7 && \
|
||||||
|
pyenv rehash && \
|
||||||
|
pip install -U pip wheel && \
|
||||||
|
pip install setuptools
|
||||||
|
|
||||||
# Install pip installable-stuff
|
# Install pip installable-stuff
|
||||||
RUN ${PIP_INSTALL} \
|
RUN ${PIP_INSTALL} \
|
||||||
|
|
|
@ -1,53 +0,0 @@
|
||||||
FROM nvidia/cuda:10.0-cudnn7-runtime-ubuntu18.04
|
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
|
||||||
ARG OCRD_VERSION_MINIMUM="2.23.3"
|
|
||||||
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
|
|
||||||
ENV PIP_DEFAULT_TIMEOUT=120
|
|
||||||
|
|
||||||
|
|
||||||
RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
|
|
||||||
apt-get update && \
|
|
||||||
apt-get install -y \
|
|
||||||
curl xz-utils \
|
|
||||||
build-essential python3-dev \
|
|
||||||
# For get-pip.py:
|
|
||||||
python3-distutils \
|
|
||||||
# For add-apt-repository:
|
|
||||||
software-properties-common \
|
|
||||||
# XML utils
|
|
||||||
libxml2-utils \
|
|
||||||
xmlstarlet \
|
|
||||||
# OCR-D uses ImageMagick for pixel density estimation
|
|
||||||
imagemagick \
|
|
||||||
&& \
|
|
||||||
apt-get clean && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
|
|
||||||
# Set up OCR-D logging
|
|
||||||
RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py
|
|
||||||
|
|
||||||
|
|
||||||
# Install pip (and setuptools)
|
|
||||||
# We use get-pip.py here to avoid
|
|
||||||
# a. having to upgrade from Ubuntu's pip
|
|
||||||
# b. the dreaded "old script wrapper" error message
|
|
||||||
RUN curl -sSL https://bootstrap.pypa.io/pip/3.6/get-pip.py -o get-pip.py && \
|
|
||||||
python3 get-pip.py && \
|
|
||||||
rm -f get-pip.py
|
|
||||||
|
|
||||||
|
|
||||||
# Install pip installable-stuff
|
|
||||||
RUN ${PIP_INSTALL} \
|
|
||||||
"ocrd >= ${OCRD_VERSION_MINIMUM}"
|
|
||||||
|
|
||||||
|
|
||||||
# Check pip dependencies
|
|
||||||
RUN pip check
|
|
||||||
|
|
||||||
|
|
||||||
WORKDIR /data
|
|
||||||
|
|
||||||
# Default command
# Exec form must be a JSON array with double-quoted strings; with single
# quotes Docker falls back to shell form and runs /bin/sh -c "['ocrd']".
CMD ["ocrd"]
|
|
|
@ -1,53 +0,0 @@
|
||||||
FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
|
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
|
||||||
ARG OCRD_VERSION_MINIMUM="2.23.3"
|
|
||||||
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
|
|
||||||
ENV PIP_DEFAULT_TIMEOUT=120
|
|
||||||
|
|
||||||
|
|
||||||
RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
|
|
||||||
apt-get update && \
|
|
||||||
apt-get install -y \
|
|
||||||
curl xz-utils \
|
|
||||||
build-essential python3-dev \
|
|
||||||
# For get-pip.py:
|
|
||||||
python3-distutils \
|
|
||||||
# For add-apt-repository:
|
|
||||||
software-properties-common \
|
|
||||||
# XML utils
|
|
||||||
libxml2-utils \
|
|
||||||
xmlstarlet \
|
|
||||||
# OCR-D uses ImageMagick for pixel density estimation
|
|
||||||
imagemagick \
|
|
||||||
&& \
|
|
||||||
apt-get clean && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
|
|
||||||
# Set up OCR-D logging
|
|
||||||
RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py
|
|
||||||
|
|
||||||
|
|
||||||
# Install pip (and setuptools)
|
|
||||||
# We use get-pip.py here to avoid
|
|
||||||
# a. having to upgrade from Ubuntu's pip
|
|
||||||
# b. the dreaded "old script wrapper" error message
|
|
||||||
RUN curl -sSL https://bootstrap.pypa.io/pip/3.6/get-pip.py -o get-pip.py && \
|
|
||||||
python3 get-pip.py && \
|
|
||||||
rm -f get-pip.py
|
|
||||||
|
|
||||||
|
|
||||||
# Install pip installable-stuff
|
|
||||||
RUN ${PIP_INSTALL} \
|
|
||||||
"ocrd >= ${OCRD_VERSION_MINIMUM}"
|
|
||||||
|
|
||||||
|
|
||||||
# Check pip dependencies
|
|
||||||
RUN pip check
|
|
||||||
|
|
||||||
|
|
||||||
WORKDIR /data
|
|
||||||
|
|
||||||
# Default command
# Exec form must be a JSON array with double-quoted strings; with single
# quotes Docker falls back to shell form and runs /bin/sh -c "['ocrd']".
CMD ["ocrd"]
|
|
70
Dockerfile-core-cuda12.1
Normal file
70
Dockerfile-core-cuda12.1
Normal file
|
@ -0,0 +1,70 @@
|
||||||
|
FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
|
||||||
|
|
||||||
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
|
ARG OCRD_VERSION_MINIMUM="2.47.0"
|
||||||
|
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
|
||||||
|
ENV PIP_DEFAULT_TIMEOUT=120
|
||||||
|
|
||||||
|
|
||||||
|
RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
|
||||||
|
apt-get update && \
|
||||||
|
apt-get install -y \
|
||||||
|
build-essential \
|
||||||
|
curl \
|
||||||
|
git \
|
||||||
|
xz-utils \
|
||||||
|
pkg-config \
|
||||||
|
# For add-apt-repository:
|
||||||
|
software-properties-common \
|
||||||
|
# XML utils
|
||||||
|
libxml2-utils \
|
||||||
|
xmlstarlet \
|
||||||
|
# OCR-D uses ImageMagick for pixel density estimation
|
||||||
|
imagemagick \
|
||||||
|
# pyenv builds
|
||||||
|
# TODO: builder container?
|
||||||
|
libz-dev \
|
||||||
|
libssl-dev \
|
||||||
|
libbz2-dev \
|
||||||
|
liblzma-dev \
|
||||||
|
libncurses-dev \
|
||||||
|
libffi-dev \
|
||||||
|
libreadline-dev \
|
||||||
|
libsqlite3-dev \
|
||||||
|
libmagic-dev \
|
||||||
|
&& \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|
||||||
|
# Set up OCR-D logging
|
||||||
|
RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py
|
||||||
|
|
||||||
|
|
||||||
|
# Install pyenv
|
||||||
|
# TODO: do not run as root
|
||||||
|
# TODO: does just saying "3.7" work as intended?
|
||||||
|
ENV HOME=/root
|
||||||
|
ENV PYENV_ROOT=/usr/local/share/pyenv
|
||||||
|
ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
|
||||||
|
RUN \
|
||||||
|
git clone --depth=1 https://github.com/yyuu/pyenv.git $PYENV_ROOT && \
|
||||||
|
pyenv install 3.7 && \
|
||||||
|
pyenv global 3.7 && \
|
||||||
|
pyenv rehash && \
|
||||||
|
pip install -U pip wheel && \
|
||||||
|
pip install setuptools
|
||||||
|
|
||||||
|
# Install pip installable-stuff
|
||||||
|
RUN ${PIP_INSTALL} \
|
||||||
|
"ocrd >= ${OCRD_VERSION_MINIMUM}"
|
||||||
|
|
||||||
|
|
||||||
|
# Check pip dependencies
|
||||||
|
RUN pip check
|
||||||
|
|
||||||
|
|
||||||
|
WORKDIR /data
|
||||||
|
|
||||||
|
# Default command
# Exec form must be a JSON array with double-quoted strings; with single
# quotes Docker falls back to shell form and runs /bin/sh -c "['ocrd']".
CMD ["ocrd"]
|
|
@ -1,14 +1,13 @@
|
||||||
ARG DRONE_COMMIT="latest"
|
ARG GIT_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG DINGLEHOPPER_COMMIT="dcc10c5"
|
ARG DINGLEHOPPER_VERSION="0.9.2"
|
||||||
|
|
||||||
|
|
||||||
# Build pip installable stuff
|
# Build pip installable stuff
|
||||||
RUN ${PIP_INSTALL} \
|
RUN ${PIP_INSTALL} \
|
||||||
# Now the real stuff:
|
"dinglehopper == $DINGLEHOPPER_VERSION"
|
||||||
https://github.com/qurator-spk/dinglehopper/archive/$DINGLEHOPPER_COMMIT.tar.gz
|
|
||||||
|
|
||||||
|
|
||||||
# Check pip dependencies
|
# Check pip dependencies
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
ARG DRONE_COMMIT="latest"
|
ARG GIT_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG EYNOLLAH_VERSION="0.0.10"
|
ARG EYNOLLAH_VERSION="0.3.0"
|
||||||
|
|
||||||
|
|
||||||
# Build pip installable stuff
|
# Build pip installable stuff
|
||||||
|
@ -10,10 +10,6 @@ RUN ${PIP_INSTALL} \
|
||||||
"eynollah == ${EYNOLLAH_VERSION}"
|
"eynollah == ${EYNOLLAH_VERSION}"
|
||||||
|
|
||||||
|
|
||||||
# Copy OCR models
|
|
||||||
COPY data/eynollah /var/lib/eynollah
|
|
||||||
|
|
||||||
|
|
||||||
# Check pip dependencies
|
# Check pip dependencies
|
||||||
RUN pip check
|
RUN pip check
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
ARG DRONE_COMMIT="latest"
|
ARG GIT_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core-cuda10.1:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG OCRD_ANYBASEOCR_VERSION="1.8.2"
|
ARG OCRD_ANYBASEOCR_VERSION="1.8.2"
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
ARG DRONE_COMMIT="latest"
|
ARG GIT_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core-cuda10.1:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT
|
||||||
|
|
||||||
|
|
||||||
# XXX https://github.com/OCR-D/core/issues/642
|
# XXX https://github.com/OCR-D/core/issues/642
|
||||||
|
@ -12,14 +12,6 @@ RUN ${PIP_INSTALL} \
|
||||||
"ocrd_calamari == $OCRD_CALAMARI_VERSION"
|
"ocrd_calamari == $OCRD_CALAMARI_VERSION"
|
||||||
|
|
||||||
|
|
||||||
# Copy OCR models
|
|
||||||
RUN mkdir -p /var/lib/calamari-models/GT4HistOCR
|
|
||||||
COPY data/calamari-models/GT4HistOCR/2019-12-11T11_10+0100 /var/lib/calamari-models/GT4HistOCR/2019-12-11T11_10+0100
|
|
||||||
# XXX experimental
|
|
||||||
#COPY data/calamari-models/GT4HistOCR/2019-12-18T17_24+0100-with-augmentation-UNTESTED /var/lib/calamari-models/GT4HistOCR/2019-12-18T17_24+0100
|
|
||||||
#COPY data/mirror/github.com/Calamari-OCR/calamari_models/gt4histocr /var/lib/calamari-models/GT4HistOCR-chreul
|
|
||||||
|
|
||||||
|
|
||||||
# Check pip dependencies
|
# Check pip dependencies
|
||||||
RUN pip check
|
RUN pip check
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
ARG DRONE_COMMIT="latest"
|
ARG GIT_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
|
|
||||||
|
@ -13,12 +13,6 @@ RUN ${PIP_INSTALL} \
|
||||||
'ocrd_calamari == 0.0.7'
|
'ocrd_calamari == 0.0.7'
|
||||||
|
|
||||||
|
|
||||||
# Copy OCR models
|
|
||||||
RUN mkdir -p /var/lib/calamari-models/GT4HistOCR
|
|
||||||
COPY data/calamari-models/GT4HistOCR/2019-07-22T15_49+0200 /var/lib/calamari-models/GT4HistOCR/2019-07-22T15_49+0200
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Check pip dependencies
|
# Check pip dependencies
|
||||||
RUN pip check
|
RUN pip check
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
ARG DRONE_COMMIT="latest"
|
ARG GIT_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG OCRD_CIS_VERSION="0.1.5"
|
ARG OCRD_CIS_VERSION="0.1.5"
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
ARG DRONE_COMMIT="latest"
|
ARG GIT_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG OCRD_FILEFORMAT_VERSION="0.5.0"
|
ARG OCRD_FILEFORMAT_VERSION="0.5.0"
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
ARG DRONE_COMMIT="latest"
|
ARG GIT_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG OCRD_OLENA_VERSION="1.3.0"
|
ARG OCRD_OLENA_VERSION="1.3.0"
|
||||||
|
@ -26,6 +26,7 @@ RUN curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena
|
||||||
cd ocrd_olena && \
|
cd ocrd_olena && \
|
||||||
sed -i 's/^install: deps$/install:/' Makefile && \
|
sed -i 's/^install: deps$/install:/' Makefile && \
|
||||||
${PIP_INSTALL} ocrd && \
|
${PIP_INSTALL} ocrd && \
|
||||||
|
make deps-ubuntu && \
|
||||||
make install PREFIX=/usr/local && \
|
make install PREFIX=/usr/local && \
|
||||||
cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz
|
cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
ARG DRONE_COMMIT="latest"
|
ARG GIT_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG OCRD_SEGMENT_VERSION="0.1.21"
|
ARG OCRD_SEGMENT_VERSION="0.1.21"
|
||||||
|
|
|
@ -1,15 +1,17 @@
|
||||||
ARG DRONE_COMMIT="latest"
|
ARG GIT_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG TESSDATA_BEST_VERSION="4.0.0"
|
ARG TESSDATA_BEST_VERSION="4.0.0"
|
||||||
ARG OCRD_TESSEROCR_VERSION="0.16.0"
|
ARG OCRD_TESSEROCR_VERSION="0.17.0"
|
||||||
ENV TESSDATA_PREFIX /usr/local/share/tessdata
|
ENV TESSDATA_PREFIX /usr/local/share/tessdata
|
||||||
|
|
||||||
|
|
||||||
# Install Leptonica and Tesseract.
|
# Install Leptonica and Tesseract.
|
||||||
RUN add-apt-repository ppa:alex-p/tesseract-ocr && \
|
# TODO: Review if alex-p's repo is still necessary on jammy (jammy has 4.1.1,
|
||||||
apt-get update && \
|
# alex-p has 4.1.3, but not for jammy.)
|
||||||
|
# RUN add-apt-repository ppa:alex-p/tesseract-ocr && \
|
||||||
|
RUN apt-get update && \
|
||||||
apt-get install -y \
|
apt-get install -y \
|
||||||
tesseract-ocr \
|
tesseract-ocr \
|
||||||
libtesseract-dev \
|
libtesseract-dev \
|
||||||
|
@ -17,18 +19,10 @@ RUN add-apt-repository ppa:alex-p/tesseract-ocr && \
|
||||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|
||||||
# Copy OCR models
|
|
||||||
RUN mkdir -p $TESSDATA_PREFIX
|
|
||||||
ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/
|
|
||||||
COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/
|
|
||||||
RUN curl -sSL -O https://ub-backup.bib.uni-mannheim.de/~stweil/ocrd-train/data/Fraktur_5000000/tessdata_fast/Fraktur_50000000.502_198857.traineddata && \
|
|
||||||
mv *.traineddata $TESSDATA_PREFIX/
|
|
||||||
|
|
||||||
# Build pip installable stuff
|
# Build pip installable stuff
|
||||||
RUN ${PIP_INSTALL} \
|
RUN ${PIP_INSTALL} \
|
||||||
"ocrd_tesserocr == ${OCRD_TESSEROCR_VERSION}"
|
"ocrd_tesserocr == ${OCRD_TESSEROCR_VERSION}"
|
||||||
|
|
||||||
|
|
||||||
# Check pip dependencies
|
# Check pip dependencies
|
||||||
RUN pip check
|
RUN pip check
|
||||||
|
|
||||||
|
|
18
Dockerfile-ocrd_trocr
Normal file
18
Dockerfile-ocrd_trocr
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
ARG GIT_COMMIT="latest"
|
||||||
|
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
|
||||||
|
|
||||||
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
|
ARG OCRD_TROCR_COMMIT="250ff1c"
|
||||||
|
|
||||||
|
|
||||||
|
# Build pip installable stuff
|
||||||
|
RUN ${PIP_INSTALL} \
|
||||||
|
https://github.com/qurator-spk/ocrd_trocr/archive/$OCRD_TROCR_COMMIT.tar.gz
|
||||||
|
|
||||||
|
|
||||||
|
# Check pip dependencies
|
||||||
|
RUN pip check
|
||||||
|
|
||||||
|
|
||||||
|
# Default command
|
||||||
|
CMD ["ocrd-trocr-recognize"]
|
|
@ -1,5 +1,5 @@
|
||||||
ARG DRONE_COMMIT="latest"
|
ARG GIT_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG OCRD_WRAP_VERSION="0.1.7"
|
ARG OCRD_WRAP_VERSION="0.1.7"
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
ARG DRONE_COMMIT="latest"
|
ARG GIT_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG SBB_BINARIZATION_VERSION="0.0.10"
|
ARG SBB_BINARIZATION_VERSION="0.0.10"
|
||||||
|
@ -11,10 +11,6 @@ RUN ${PIP_INSTALL} \
|
||||||
"sbb_binarization == $SBB_BINARIZATION_VERSION"
|
"sbb_binarization == $SBB_BINARIZATION_VERSION"
|
||||||
|
|
||||||
|
|
||||||
# Copy models
|
|
||||||
COPY data/sbb_binarization/2021-03-09 /var/lib/sbb_binarization
|
|
||||||
|
|
||||||
|
|
||||||
# Check pip dependencies
|
# Check pip dependencies
|
||||||
RUN pip check
|
RUN pip check
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
ARG DRONE_COMMIT="latest"
|
ARG GIT_COMMIT="latest"
|
||||||
FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
|
FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip install --no-cache-dir"
|
||||||
ARG SBB_TEXTLINE_DETECTOR_COMMIT="c4df3d6"
|
ARG SBB_TEXTLINE_DETECTOR_COMMIT="c4df3d6"
|
||||||
|
@ -12,10 +12,6 @@ RUN ${PIP_INSTALL} \
|
||||||
https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz
|
https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz
|
||||||
|
|
||||||
|
|
||||||
# Copy OCR models
|
|
||||||
COPY data/textline_detection /var/lib/textline_detection
|
|
||||||
|
|
||||||
|
|
||||||
# Check pip dependencies
|
# Check pip dependencies
|
||||||
RUN pip check
|
RUN pip check
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,6 @@ How to add a processor
|
||||||
* Add model download to `build` (if necessary)
|
* Add model download to `build` (if necessary)
|
||||||
* Add a Dockerfile
|
* Add a Dockerfile
|
||||||
* Add commands to `wrapper/qurator/ocrd_galley/cli.py`
|
* Add commands to `wrapper/qurator/ocrd_galley/cli.py`
|
||||||
* Add to `.drone.star`
|
|
||||||
|
|
||||||
Releasing
|
Releasing
|
||||||
---------
|
---------
|
||||||
|
@ -27,3 +26,12 @@ issue should be open that reminds us to go back to a versioned release again.
|
||||||
|
|
||||||
Other than relying on "proper releases", this also has a second purpose: Review
|
Other than relying on "proper releases", this also has a second purpose: Review
|
||||||
of qurator-spk releases.
|
of qurator-spk releases.
|
||||||
|
|
||||||
|
|
||||||
|
Test builds
|
||||||
|
-----------
|
||||||
|
XXX Review this
|
||||||
|
```
|
||||||
|
GIT_COMMIT=test ./build Dockerfile-core Dockerfile-ocrd_tesserocr
|
||||||
|
DOCKER_IMAGE_TAG=test ./test-ocrd_tesserocr.sh
|
||||||
|
```
|
||||||
|
|
25
README.md
25
README.md
|
@ -29,14 +29,9 @@ including all dependencies in Docker.
|
||||||
|
|
||||||
How to use
|
How to use
|
||||||
----------
|
----------
|
||||||
**Currently, due to problems with the Travis CI, we do not provide pre-built
|
ocrd-galley uses Docker to run the OCR-D images. We provide pre-built container
|
||||||
containers anymore.**
|
images that get downloaded automatically when you run the provided wrappers for
|
||||||
|
the OCR-D processors.
|
||||||
To build the containers yourself using Docker:
|
|
||||||
~~~
|
|
||||||
cd ~/devel/ocrd-galley/
|
|
||||||
./build
|
|
||||||
~~~
|
|
||||||
|
|
||||||
You can then install the wrappers into a Python venv:
|
You can then install the wrappers into a Python venv:
|
||||||
~~~
|
~~~
|
||||||
|
@ -44,9 +39,13 @@ cd ~/devel/ocrd-galley/wrapper
|
||||||
pip install .
|
pip install .
|
||||||
~~~
|
~~~
|
||||||
|
|
||||||
|
To download models, you need to use the `-a` flag of `ocrd resmgr`:
|
||||||
|
~~~
|
||||||
|
ocrd resmgr download -a ocrd-calamari-recognize default
|
||||||
|
~~~
|
||||||
|
|
||||||
You may then use the script `my_ocrd_workflow` to use your self-built
|
You may then use the script `my_ocrd_workflow` to use your self-built
|
||||||
containers on an example workspace:
|
containers on an example workspace:
|
||||||
|
|
||||||
~~~
|
~~~
|
||||||
# Download an example workspace
|
# Download an example workspace
|
||||||
cd /tmp
|
cd /tmp
|
||||||
|
@ -110,3 +109,11 @@ cd workspace-xxxxx # output by the last command
|
||||||
~~~
|
~~~
|
||||||
|
|
||||||
This produces a workspace from the files and then runs the OCR workflow on it.
|
This produces a workspace from the files and then runs the OCR workflow on it.
|
||||||
|
|
||||||
|
Build the containers yourself
|
||||||
|
-----------------------------
|
||||||
|
To build the containers yourself using Docker:
|
||||||
|
~~~
|
||||||
|
cd ~/devel/ocrd-galley/
|
||||||
|
./build
|
||||||
|
~~~
|
||||||
|
|
32
build
32
build
|
@ -22,38 +22,10 @@ echo "$sub_images"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
DATA_SUBDIR=data
|
|
||||||
get_from_annex() {
|
|
||||||
annex_get 'calamari-models/GT4HistOCR/2019-07-22T15_49+0200/*.ckpt*'
|
|
||||||
annex_get 'calamari-models/GT4HistOCR/2019-12-11T11_10+0100/*.ckpt*'
|
|
||||||
annex_get 'calamari-models/GT4HistOCR/2019-12-18T17_24+0100*/*.ckpt*'
|
|
||||||
annex_get 'mirror/github.com/Calamari-OCR/calamari_models/gt4histocr/*.ckpt*'
|
|
||||||
annex_get 'tesseract-models/GT4HistOCR/*.traineddata'
|
|
||||||
annex_get 'textline_detection/*.h5'
|
|
||||||
annex_get 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz'
|
|
||||||
annex_get 'sbb_binarization/2021-03-09/*.h5'
|
|
||||||
annex_get 'eynollah/*.h5'
|
|
||||||
}
|
|
||||||
get_from_web() {
|
|
||||||
download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz' 'calamari-models/GT4HistOCR/2019-07-22T15_49+0200'
|
|
||||||
download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz' 'calamari-models/GT4HistOCR/2019-12-11T11_10+0100'
|
|
||||||
download_to 'https://qurator-data.de/tesseract-models/GT4HistOCR/models.tar' 'tesseract-models/GT4HistOCR'
|
|
||||||
download_to 'https://qurator-data.de/sbb_textline_detector/models.tar.gz' 'textline_detection'
|
|
||||||
download_to --strip-components 1 'https://qurator-data.de/sbb_binarization/2021-03-09/models.tar.gz' 'sbb_binarization/2021-03-09'
|
|
||||||
download_to --no-unpack 'https://qurator-data.de/mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz'
|
|
||||||
download_to --strip-components 1 'https://qurator-data.de/eynollah/models_eynollah.tar.gz' 'eynollah'
|
|
||||||
}
|
|
||||||
. $self_dir/qurator_data_lib.sh
|
|
||||||
handle_data
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Update base images if we build a core image
|
# Update base images if we build a core image
|
||||||
if echo "$sub_images" | grep -q core; then
|
if echo "$sub_images" | grep -q core; then
|
||||||
docker pull ubuntu:18.04
|
docker pull ubuntu:22.04
|
||||||
docker pull nvidia/cuda:10.0-cudnn7-runtime-ubuntu18.04
|
docker pull nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
|
||||||
docker pull nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
for sub_image in $sub_images; do
|
for sub_image in $sub_images; do
|
||||||
|
|
|
@ -1,18 +0,0 @@
|
||||||
#!/bin/bash -x
|
|
||||||
set -e
|
|
||||||
|
|
||||||
self=`realpath $0`
|
|
||||||
self_dir=`dirname "$self"`
|
|
||||||
|
|
||||||
DATA_SUBDIR=data
|
|
||||||
get_from_web() {
|
|
||||||
download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz' 'calamari-models/GT4HistOCR/2019-07-22T15_49+0200'
|
|
||||||
download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz' 'calamari-models/GT4HistOCR/2019-12-11T11_10+0100'
|
|
||||||
download_to 'https://qurator-data.de/tesseract-models/GT4HistOCR/models.tar' 'tesseract-models/GT4HistOCR'
|
|
||||||
download_to 'https://qurator-data.de/sbb_textline_detector/models.tar.gz' 'textline_detection'
|
|
||||||
download_to --strip-components 1 'https://qurator-data.de/sbb_binarization/2021-03-09/models.tar.gz' 'sbb_binarization/2021-03-09'
|
|
||||||
download_to --no-unpack 'https://qurator-data.de/mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz'
|
|
||||||
download_to --strip-components 1 'https://qurator-data.de/eynollah/models_eynollah.tar.gz' 'eynollah'
|
|
||||||
}
|
|
||||||
. $self_dir/qurator_data_lib.sh
|
|
||||||
handle_data
|
|
1
data
1
data
|
@ -1 +0,0 @@
|
||||||
Subproject commit 9ab08a3626dde1d38dd622b65e425277cd029722
|
|
14
test-core.sh
Executable file
14
test-core.sh
Executable file
|
@ -0,0 +1,14 @@
|
||||||
|
#!/bin/sh
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
test_id=`basename $0`
|
||||||
|
cd `mktemp -d /tmp/$test_id-XXXXX`
|
||||||
|
|
||||||
|
# Prepare test workspace
|
||||||
|
wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip
|
||||||
|
unzip actevedef_718448162.first-page+binarization+segmentation.zip
|
||||||
|
cd actevedef_718448162.first-page+binarization+segmentation
|
||||||
|
|
||||||
|
# Run tests
|
||||||
|
ocrd workspace validate \
|
||||||
|
--page-coordinate-consistency off
|
13
test-dinglehopper.sh
Executable file
13
test-dinglehopper.sh
Executable file
|
@ -0,0 +1,13 @@
|
||||||
|
#!/bin/sh
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
test_id=`basename $0`
|
||||||
|
cd `mktemp -d /tmp/$test_id-XXXXX`
|
||||||
|
|
||||||
|
# Prepare test workspace
|
||||||
|
wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip
|
||||||
|
unzip actevedef_718448162.first-page+binarization+segmentation.zip
|
||||||
|
cd actevedef_718448162.first-page+binarization+segmentation
|
||||||
|
|
||||||
|
# Run tests
|
||||||
|
ocrd-dinglehopper -I OCR-D-GT-PAGE,OCR-D-SEG-LINE-SBB -O DINGLEHOPPER-TEST
|
12
test-ocrd_olena.sh
Executable file
12
test-ocrd_olena.sh
Executable file
|
@ -0,0 +1,12 @@
|
||||||
|
#!/bin/sh
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
cd `mktemp -d /tmp/test-ocrd_olena-XXXXX`
|
||||||
|
|
||||||
|
# Prepare test workspace
|
||||||
|
wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip
|
||||||
|
unzip actevedef_718448162.first-page+binarization+segmentation.zip
|
||||||
|
cd actevedef_718448162.first-page+binarization+segmentation
|
||||||
|
|
||||||
|
# Run tests
|
||||||
|
ocrd-olena-binarize -I OCR-D-IMG -O TEST-OLENA
|
17
test-ocrd_tesserocr.sh
Executable file
17
test-ocrd_tesserocr.sh
Executable file
|
@ -0,0 +1,17 @@
|
||||||
|
#!/bin/sh
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
cd `mktemp -d /tmp/test-ocrd_tesserocr-XXXXX`
|
||||||
|
|
||||||
|
# Prepare processors
|
||||||
|
ocrd resmgr download ocrd-tesserocr-recognize Fraktur_GT4HistOCR.traineddata
|
||||||
|
|
||||||
|
# Prepare test workspace
|
||||||
|
wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip
|
||||||
|
unzip actevedef_718448162.first-page+binarization+segmentation.zip
|
||||||
|
cd actevedef_718448162.first-page+binarization+segmentation
|
||||||
|
|
||||||
|
# Run tests
|
||||||
|
ocrd-tesserocr-segment-region -I OCR-D-IMG-BIN -O TEST-TESS-SEG-REG
|
||||||
|
ocrd-tesserocr-segment-line -I TEST-TESS-SEG-REG -O TEST-TESS-SEG-LINE
|
||||||
|
ocrd-tesserocr-recognize -I TEST-TESS-SEG-LINE -O TEST-TESS-OCR -P model Fraktur_GT4HistOCR
|
14
test-ocrd_trocr.sh
Executable file
14
test-ocrd_trocr.sh
Executable file
|
@ -0,0 +1,14 @@
|
||||||
|
#!/bin/sh
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
cd `mktemp -d /tmp/test-ocrd_trocr-XXXXX`
|
||||||
|
|
||||||
|
# Prepare processors
|
||||||
|
|
||||||
|
# Prepare test workspace
|
||||||
|
wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip
|
||||||
|
unzip actevedef_718448162.first-page+binarization+segmentation.zip
|
||||||
|
cd actevedef_718448162.first-page+binarization+segmentation
|
||||||
|
|
||||||
|
# Run tests
|
||||||
|
ocrd-trocr-recognize -I OCR-D-SEG-LINE-SBB -O TEST-TROCR
|
|
@ -1 +0,0 @@
|
||||||
from .cli import *
|
|
|
@ -1,61 +1,48 @@
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
import colorama
|
||||||
|
from pathlib import Path
|
||||||
|
from termcolor import colored
|
||||||
|
|
||||||
|
from .sub_images import sub_images
|
||||||
|
|
||||||
DOCKER_IMAGE_PREFIX = os.environ.get("DOCKER_IMAGE_PREFIX", "quratorspk/ocrd-galley")
|
DOCKER_IMAGE_PREFIX = os.environ.get("DOCKER_IMAGE_PREFIX", "quratorspk/ocrd-galley")
|
||||||
DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "latest")
|
DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "latest")
|
||||||
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
|
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
|
||||||
|
|
||||||
|
# xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler
|
||||||
|
# to just roll it on our own.
|
||||||
|
XDG_CONFIG_HOME = os.environ.get("XDG_CONFIG_HOME", Path.home() / ".config")
|
||||||
|
# Always a Path: a value taken from the environment is a plain str, and
# TESSDATA_PREFIX below is derived from this with the "/" operator, which
# would raise TypeError for str operands.
XDG_DATA_HOME = Path(os.environ.get("XDG_DATA_HOME", Path.home() / ".local" / "share"))
|
||||||
|
XDG_CACHE_HOME = os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache")
|
||||||
|
|
||||||
sub_images = {
|
# ocrd_tesserocr
|
||||||
"ocrd": "core",
|
TESSDATA_PREFIX = XDG_DATA_HOME / "ocrd-resources" / "ocrd-tesserocr-recognize"
|
||||||
"ocrd-olena-binarize": "ocrd_olena",
|
|
||||||
"ocrd-sbb-binarize": "sbb_binarization",
|
|
||||||
"ocrd-sbb-textline-detector": "sbb_textline_detector",
|
|
||||||
"ocrd-calamari-recognize": "ocrd_calamari",
|
|
||||||
"ocrd-calamari-recognize03": "ocrd_calamari03",
|
|
||||||
"ocrd-tesserocr-segment-region": "ocrd_tesserocr",
|
|
||||||
"ocrd-tesserocr-segment-line": "ocrd_tesserocr",
|
|
||||||
"ocrd-tesserocr-recognize": "ocrd_tesserocr",
|
|
||||||
"ocrd-dinglehopper": "dinglehopper",
|
|
||||||
"ocrd-cis-ocropy-clip": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-resegment": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-segment": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-deskew": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-denoise": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-binarize": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-dewarp": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-recognize": "ocrd_cis",
|
|
||||||
"ocrd-fileformat-transform": "ocrd_fileformat",
|
|
||||||
"ocrd-segment-extract-pages": "ocrd_segment",
|
|
||||||
"ocrd-segment-extract-regions": "ocrd_segment",
|
|
||||||
"ocrd-segment-extract-lines": "ocrd_segment",
|
|
||||||
"ocrd-segment-from-masks": "ocrd_segment",
|
|
||||||
"ocrd-segment-from-coco": "ocrd_segment",
|
|
||||||
"ocrd-segment-repair": "ocrd_segment",
|
|
||||||
"ocrd-segment-evaluate": "ocrd_segment",
|
|
||||||
"ocrd-preprocess-image": "ocrd_wrap",
|
|
||||||
"ocrd-skimage-normalize": "ocrd_wrap",
|
|
||||||
"ocrd-skimage-denoise-raw": "ocrd_wrap",
|
|
||||||
"ocrd-skimage-binarize": "ocrd_wrap",
|
|
||||||
"ocrd-skimage-denoise": "ocrd_wrap",
|
|
||||||
"ocrd-eynollah-segment": "eynollah",
|
|
||||||
"ocrd-anybaseocr-crop": "ocrd_anybaseocr",
|
|
||||||
"ocrd-anybaseocr-deskew": "ocrd_anybaseocr",
|
|
||||||
|
|
||||||
# non OCR-D CLI
|
|
||||||
"ocr-transform": "ocrd_fileformat",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
colorama.init()
|
||||||
|
|
||||||
argv = sys.argv.copy()
|
argv = sys.argv.copy()
|
||||||
argv[0] = os.path.basename(argv[0])
|
argv[0] = os.path.basename(argv[0])
|
||||||
|
|
||||||
|
# If we're running ocrd resmgr download we need to run the correct subimage.
|
||||||
|
if argv[:3] == ["ocrd", "resmgr", "download"] or \
|
||||||
|
argv[:3] == ["ocrd", "resmgr", "list-available"]:
|
||||||
|
# Default to the base image
|
||||||
sub_image = sub_images[argv[0]]
|
sub_image = sub_images[argv[0]]
|
||||||
|
# But look for a match of the executable
|
||||||
|
for x in argv[3:]:
|
||||||
|
if x in sub_images:
|
||||||
|
sub_image = sub_images[x]
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
sub_image = sub_images[argv[0]]
|
||||||
|
|
||||||
docker_image = "%s-%s:%s" % (DOCKER_IMAGE_PREFIX, sub_image, DOCKER_IMAGE_TAG)
|
docker_image = "%s-%s:%s" % (DOCKER_IMAGE_PREFIX, sub_image, DOCKER_IMAGE_TAG)
|
||||||
|
|
||||||
|
if DOCKER_IMAGE_TAG != "latest":
|
||||||
|
print(colored(f"Using {docker_image}", 'red'))
|
||||||
docker_run(argv, docker_image)
|
docker_run(argv, docker_image)
|
||||||
|
|
||||||
|
|
||||||
|
@ -67,6 +54,29 @@ def docker_run(argv, docker_image):
|
||||||
docker_run_options.extend(["-e", "LOG_LEVEL=%s" % LOG_LEVEL])
|
docker_run_options.extend(["-e", "LOG_LEVEL=%s" % LOG_LEVEL])
|
||||||
docker_run_options.extend(["-e", "_OCRD_COMPLETE"])
|
docker_run_options.extend(["-e", "_OCRD_COMPLETE"])
|
||||||
|
|
||||||
|
# home directory
|
||||||
|
docker_run_options.extend(["-e", "HOME=%s" % Path.home()])
|
||||||
|
|
||||||
|
# .config
|
||||||
|
docker_run_options.extend(["-e", "XDG_CONFIG_HOME=%s" % XDG_CONFIG_HOME])
|
||||||
|
docker_run_options.extend(["--mount", "type=bind,src=%s,target=%s" %
|
||||||
|
(XDG_CONFIG_HOME, XDG_CONFIG_HOME)])
|
||||||
|
# .local/share
|
||||||
|
docker_run_options.extend(["-e", "XDG_DATA_HOME=%s" % XDG_DATA_HOME])
|
||||||
|
docker_run_options.extend(["--mount", "type=bind,src=%s,target=%s" %
|
||||||
|
(XDG_DATA_HOME, XDG_DATA_HOME)])
|
||||||
|
# .cache
|
||||||
|
docker_run_options.extend(["-e", "XDG_CACHE_HOME=%s" % XDG_CACHE_HOME])
|
||||||
|
docker_run_options.extend(["--mount", "type=bind,src=%s,target=%s" %
|
||||||
|
(XDG_CACHE_HOME, XDG_CACHE_HOME)])
|
||||||
|
# .huggingface
|
||||||
|
os.makedirs(Path.home() / ".huggingface", exist_ok=True)
|
||||||
|
docker_run_options.extend(["--mount", "type=bind,src=%s,target=%s" %
|
||||||
|
(Path.home() / ".huggingface", Path("/root") / ".huggingface")])
|
||||||
|
|
||||||
|
# ocrd_tesserocr
|
||||||
|
docker_run_options.extend(["-e", "TESSDATA_PREFIX=%s" % TESSDATA_PREFIX])
|
||||||
|
|
||||||
# JAVA_TOOL_OPTIONS is used for Java proxy settings
|
# JAVA_TOOL_OPTIONS is used for Java proxy settings
|
||||||
if os.environ.get("JAVA_TOOL_OPTIONS"):
|
if os.environ.get("JAVA_TOOL_OPTIONS"):
|
||||||
docker_run_options.extend(["-e", "JAVA_TOOL_OPTIONS"])
|
docker_run_options.extend(["-e", "JAVA_TOOL_OPTIONS"])
|
||||||
|
|
40
wrapper/qurator/ocrd_galley/sub_images.py
Normal file
40
wrapper/qurator/ocrd_galley/sub_images.py
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
sub_images = {
|
||||||
|
"ocrd": "core",
|
||||||
|
"ocrd-olena-binarize": "ocrd_olena",
|
||||||
|
"ocrd-sbb-binarize": "sbb_binarization",
|
||||||
|
"ocrd-sbb-textline-detector": "sbb_textline_detector",
|
||||||
|
"ocrd-calamari-recognize": "ocrd_calamari",
|
||||||
|
"ocrd-calamari-recognize03": "ocrd_calamari03",
|
||||||
|
"ocrd-tesserocr-segment-region": "ocrd_tesserocr",
|
||||||
|
"ocrd-tesserocr-segment-line": "ocrd_tesserocr",
|
||||||
|
"ocrd-tesserocr-recognize": "ocrd_tesserocr",
|
||||||
|
"ocrd-dinglehopper": "dinglehopper",
|
||||||
|
"ocrd-cis-ocropy-clip": "ocrd_cis",
|
||||||
|
"ocrd-cis-ocropy-resegment": "ocrd_cis",
|
||||||
|
"ocrd-cis-ocropy-segment": "ocrd_cis",
|
||||||
|
"ocrd-cis-ocropy-deskew": "ocrd_cis",
|
||||||
|
"ocrd-cis-ocropy-denoise": "ocrd_cis",
|
||||||
|
"ocrd-cis-ocropy-binarize": "ocrd_cis",
|
||||||
|
"ocrd-cis-ocropy-dewarp": "ocrd_cis",
|
||||||
|
"ocrd-cis-ocropy-recognize": "ocrd_cis",
|
||||||
|
"ocrd-fileformat-transform": "ocrd_fileformat",
|
||||||
|
"ocrd-segment-extract-pages": "ocrd_segment",
|
||||||
|
"ocrd-segment-extract-regions": "ocrd_segment",
|
||||||
|
"ocrd-segment-extract-lines": "ocrd_segment",
|
||||||
|
"ocrd-segment-from-masks": "ocrd_segment",
|
||||||
|
"ocrd-segment-from-coco": "ocrd_segment",
|
||||||
|
"ocrd-segment-repair": "ocrd_segment",
|
||||||
|
"ocrd-segment-evaluate": "ocrd_segment",
|
||||||
|
"ocrd-preprocess-image": "ocrd_wrap",
|
||||||
|
"ocrd-skimage-normalize": "ocrd_wrap",
|
||||||
|
"ocrd-skimage-denoise-raw": "ocrd_wrap",
|
||||||
|
"ocrd-skimage-binarize": "ocrd_wrap",
|
||||||
|
"ocrd-skimage-denoise": "ocrd_wrap",
|
||||||
|
"ocrd-eynollah-segment": "eynollah",
|
||||||
|
"ocrd-anybaseocr-crop": "ocrd_anybaseocr",
|
||||||
|
"ocrd-anybaseocr-deskew": "ocrd_anybaseocr",
|
||||||
|
"ocrd-trocr-recognize": "ocrd_trocr",
|
||||||
|
|
||||||
|
# non OCR-D CLI
|
||||||
|
"ocr-transform": "ocrd_fileformat",
|
||||||
|
}
|
2
wrapper/requirements.txt
Normal file
2
wrapper/requirements.txt
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
colorama
|
||||||
|
termcolor
|
|
@ -1,9 +1,12 @@
|
||||||
from io import open
|
from io import open
|
||||||
from setuptools import find_packages, setup
|
from setuptools import find_packages, setup
|
||||||
|
|
||||||
from qurator.ocrd_galley.cli import sub_images
|
from qurator.ocrd_galley.sub_images import sub_images
|
||||||
console_scripts = ["%s=qurator.ocrd_galley.cli:main" % command for command in sub_images.keys()]
|
console_scripts = ["%s=qurator.ocrd_galley.cli:main" % command for command in sub_images.keys()]
|
||||||
|
|
||||||
|
with open("requirements.txt") as fp:
|
||||||
|
install_requires = fp.read()
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="ocrd-galley",
|
name="ocrd-galley",
|
||||||
author="Mike Gerber, The QURATOR SPK Team",
|
author="Mike Gerber, The QURATOR SPK Team",
|
||||||
|
@ -13,6 +16,7 @@ setup(
|
||||||
license="Apache",
|
license="Apache",
|
||||||
packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
|
packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
|
||||||
namespace_packages=["qurator"],
|
namespace_packages=["qurator"],
|
||||||
|
install_requires=install_requires,
|
||||||
entry_points={
|
entry_points={
|
||||||
"console_scripts": console_scripts,
|
"console_scripts": console_scripts,
|
||||||
},
|
},
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue