Merge branch 'test/github-actions'

master
Gerber, Mike 1 year ago
commit 0cfeb76bba

@ -1,74 +0,0 @@
def main(ctx):
    """Build the Drone pipeline definition for the current build.

    Tag events produce a pipeline named "release"; builds on the "master"
    branch produce a pipeline named "master" whose images are additionally
    tagged "latest". Every image is tagged with the build's commit.

    Returns a one-element list holding the pipeline dict, or None when the
    build is neither a tag event nor on the master branch.
    """
    image_tags = [ctx.build.commit]
    if ctx.build.event == "tag":
        pipeline_name = "release"
    elif ctx.build.branch == "master":
        pipeline_name = "master"
        image_tags.append("latest")
    else:
        return None

    prepare_step = {
        "name": "prepare data",
        "image": "alpine",
        "commands": [
            "apk update && apk add bash curl",
            "FORCE_DOWNLOAD=y ./build-tmp-XXX"
        ]
    }

    # We can't glob and have to add here manually...
    sub_images = [
        "core",
        "core-cuda10.0",
        "core-cuda10.1",
        "dinglehopper",
        "ocrd_calamari",
        "ocrd_calamari03",
        "ocrd_cis",
        "ocrd_fileformat",
        "ocrd_olena",
        "ocrd_segment",
        "ocrd_tesserocr",
        "ocrd_wrap",
        "sbb_binarization",
        "sbb_textline_detector",
        "eynollah",
        "ocrd_anybaseocr",
    ]
    # One docker build step per sub-image, in the order listed above.
    build_steps = [step_for(ctx, sub_image, image_tags) for sub_image in sub_images]

    # Mail notification, sent for successful and failed builds alike.
    notify_step = {
        "name": "notify",
        "image": "drillster/drone-email",
        "settings": {
            "host": "172.17.0.1",
            "port": "25",
            "from": "drone@ci.moegen-wir.net",
        },
        "when": {
            "status": ["success", "failure"]
        }
    }

    pipeline = {
        "kind": "pipeline",
        "name": pipeline_name,
        "steps": [prepare_step] + build_steps + [notify_step]
    }
    return [pipeline]
def step_for(ctx, sub_image, tags):
    """Return the Drone step that builds one sub-image with plugins/docker.

    ctx: Drone build context; only ctx.build.commit is read here.
    sub_image: suffix used for the step name, the Docker repository
        ("quratorspk/ocrd-galley-<sub_image>") and the Dockerfile
        ("Dockerfile-<sub_image>").
    tags: list of image tags, passed through unchanged.
    """
    plugin_settings = {
        # The commit is handed to the Dockerfile as a build argument.
        "build_args": ["DRONE_COMMIT=%s" % ctx.build.commit],
        "tags": tags,
        # Registry credentials are taken from Drone secrets.
        "username": {"from_secret": "docker_username"},
        "password": {"from_secret": "docker_password"},
        "repo": "quratorspk/ocrd-galley-%s" % sub_image,
        "dockerfile": "Dockerfile-%s" % sub_image,
    }
    step = {
        "name": "build %s" % sub_image,
        "image": "plugins/docker",
        "settings": plugin_settings,
    }
    return step

@ -0,0 +1,38 @@
#!/usr/bin/python3
import glob
import re
import sys
import argparse
import json
def find_subimages():
    """Return the set of subimage names found in the current directory.

    A subimage name is the file name of a "Dockerfile-*" file with the
    leading "Dockerfile-" removed.
    """
    return {
        re.sub(r"^Dockerfile-", "", dockerfile)
        for dockerfile in glob.glob("Dockerfile-*")
    }


def select_subimages(all_subimages, core=False, rest=False):
    """Return the requested subimage names as one list.

    Core subimages are those whose name starts with "core"; the rest are
    all others.

    all_subimages: iterable of subimage names.
    core: include the core subimages.
    rest: include the non-core subimages.

    With neither flag set, every subimage is returned in sorted order.
    With both flags set, the sorted core subimages come first, followed by
    the sorted rest, so that a single list (and thus a single JSON
    document) is produced instead of two separate outputs.
    """
    all_subimages = set(all_subimages)
    core_subimages = {si for si in all_subimages if si.startswith("core")}
    rest_subimages = all_subimages - core_subimages

    if not core and not rest:
        return sorted(all_subimages)

    selected = []
    if core:
        selected.extend(sorted(core_subimages))
    if rest:
        selected.extend(sorted(rest_subimages))
    return selected


def format_subimages(subimages, as_json=False):
    """Render the list as a JSON array or as one name per line."""
    if as_json:
        return json.dumps(subimages)
    return "\n".join(subimages)


def main(argv=None):
    """Parse the command line and print the selected subimages.

    argv: argument list to parse; None makes argparse use sys.argv[1:].
    """
    parser = argparse.ArgumentParser(description='List subimages.')
    parser.add_argument('--core', action='store_true',
                        default=False, help='List core subimages')
    parser.add_argument('--rest', action='store_true',
                        default=False, help='List rest subimages')
    parser.add_argument('--json', action='store_true',
                        default=False, help='Return list as JSON')
    args = parser.parse_args(argv)

    subimages = select_subimages(find_subimages(), core=args.core, rest=args.rest)
    print(format_subimages(subimages, as_json=args.json))


# Only run when executed as a script, so importing has no side effects.
if __name__ == "__main__":
    main()

@ -0,0 +1,58 @@
# Reusable workflow (triggered via workflow_call) that builds one
# ocrd-galley subimage from its Dockerfile and pushes it to Docker Hub.
on:
workflow_call:
inputs:
# subimage: suffix used for the image name (quratorspk/ocrd-galley-<subimage>)
# and for the Dockerfile name (Dockerfile-<subimage>).
subimage:
required: true
type: string
# tags: handed unchanged to the "tags" input of docker/metadata-action.
tags:
required: true
type: string
# Docker Hub credentials used by the login step.
secrets:
DOCKERHUB_USERNAME:
required: true
DOCKERHUB_TOKEN:
required: true
jobs:
build-subimage-job:
runs-on: ubuntu-latest
steps:
-
name: Checkout
uses: actions/checkout@v3
# We are checking out explicitly, so build-push-action isn't trying
# to checkout the (unreachable) submodule. (Using "context" there.)
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
# Computes the image tags; its output is consumed by the build step
# as steps.meta.outputs.tags.
-
name: Docker meta
id: meta
uses: docker/metadata-action@v4
with:
images: |
quratorspk/ocrd-galley-${{ inputs.subimage }}
flavor: |
latest=auto
# latest=auto should generate "latest" for the type=semver tags entry
tags: ${{ inputs.tags }}
-
name: Login to Docker Hub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
# Builds Dockerfile-<subimage> from the checked-out tree (context ".") and
# pushes it with the tags from the "meta" step. GIT_COMMIT is passed as
# "sha-<commit>", and the image carrying that same tag is named as the
# cache source.
-
name: Build ${{ inputs.subimage }}
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile-${{ inputs.subimage }}
build-args: |
GIT_COMMIT=sha-${{ github.sha }}
BUILDKIT_INLINE_CACHE=1
tags: ${{ steps.meta.outputs.tags }}
push: true
cache-from: quratorspk/ocrd-galley-${{ inputs.subimage }}:sha-${{ github.sha }}

@ -1,6 +1,7 @@
name: build
on:
workflow_dispatch:
push:
@ -9,28 +10,93 @@ on:
- 'test/github-actions'
jobs:
docker:
matrix:
runs-on: ubuntu-latest
outputs:
core: ${{ steps.step1.outputs.core }}
rest: ${{ steps.step1.outputs.rest }}
all: ${{ steps.step1.outputs.all }}
steps:
-
name: Checkout
uses: actions/checkout@v3
# We are checking out explicitly, so build-push-action isn't trying
# to checkout the (unreachable) submodule. (Using "context" there.)
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
name: Generate outputs
id: step1
run: |
echo "core=$(./.github/list-subimages --core --json)" >>$GITHUB_OUTPUT
echo "rest=$(./.github/list-subimages --rest --json)" >>$GITHUB_OUTPUT
echo "all=$(./.github/list-subimages --json)" >>$GITHUB_OUTPUT
echo "GITHUB_OUTPUT:"
cat $GITHUB_OUTPUT
build-core:
needs: matrix
strategy:
matrix:
subimage: ${{ fromJson(needs.matrix.outputs.core) }}
uses: ./.github/workflows/build-subimage.yml
with:
subimage: ${{ matrix.subimage }}
tags: |
type=sha,format=long
# Here: NOT the full tags, just the sha! (they get added below)
secrets: inherit
# TODO data
# TODO matrix for all Dockerfiles
build-rest:
needs: [matrix, build-core]
strategy:
matrix:
subimage: ${{ fromJson(needs.matrix.outputs.rest) }}
uses: ./.github/workflows/build-subimage.yml
with:
subimage: ${{ matrix.subimage }}
tags: |
type=sha,format=long
secrets: inherit
test:
needs: build-rest
runs-on: ubuntu-latest
env:
DOCKER_IMAGE_TAG: sha-${{ github.sha }} # needed to run the correct version through the wrapper
steps:
-
name: Build
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile-core
build-args:
DRONE_COMMIT=${{ github.sha }}
push: false
name: Checkout
uses: actions/checkout@v3
-
name: Install wrapper
run: |
sudo apt-get install -y python3-pip
cd wrapper && pip install .
-
name: Test
run: |
ocrd --version
ocrd-dinglehopper --version
# At this point, we have successfully built, uploaded and tested the images. We now just need to add
# the remaining tags. We do this by building again, using the previously built
# images as the build cache.
push-with-tags:
needs: [matrix, test]
strategy:
matrix:
subimage: ${{ fromJson(needs.matrix.outputs.all) }}
uses: ./.github/workflows/build-subimage.yml
with:
subimage: ${{ matrix.subimage }}
tags: |
type=sha,format=long
type=edge,branch=master
type=ref,event=branch
type=semver,pattern={{version}}
# Here: full tags
# Note: Do NOT use event=tag here, unless re-configuring the "latest"
# behavior too as that triggers on event=tag by default. By default,
# "latest" triggers on type=semver here, too (which is wanted).
secrets: inherit

3
.gitmodules vendored

@ -1,3 +0,0 @@
[submodule "data"]
path = data
url = git@code.dev.sbb.berlin:qurator/qurator-data.git

@ -9,10 +9,10 @@ ENV PIP_DEFAULT_TIMEOUT=120
RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
apt-get update && \
apt-get install -y \
curl xz-utils \
build-essential python3-dev \
# For get-pip.py:
python3-distutils \
build-essential \
curl \
git \
xz-utils \
# For add-apt-repository:
software-properties-common \
# XML utils
@ -20,6 +20,17 @@ RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
xmlstarlet \
# OCR-D uses ImageMagick for pixel density estimation
imagemagick \
# pyenv builds
# TODO: builder container?
libz-dev \
libssl-dev \
libbz2-dev \
liblzma-dev \
libncurses-dev \
libffi-dev \
libreadline-dev \
libsqlite3-dev \
libmagic-dev \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
@ -29,14 +40,19 @@ RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py
# Install pip (and setuptools)
# We use get-pip.py here to avoid
# a. having to upgrade from Ubuntu's pip
# b. the dreaded "old script wrapper" error message
RUN curl -sSL https://bootstrap.pypa.io/pip/3.6/get-pip.py -o get-pip.py && \
python3 get-pip.py && \
rm -f get-pip.py
# Install pyenv
# TODO: do not run as root
# TODO: does just saying "3.7" work as intended?
ENV HOME=/root
ENV PYENV_ROOT=/usr/local/share/pyenv
ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
RUN \
git clone --depth=1 https://github.com/yyuu/pyenv.git $PYENV_ROOT && \
pyenv install 3.7 && \
pyenv global 3.7 && \
pyenv rehash && \
pip install -U pip && \
pip install setuptools
# Install pip-installable stuff
RUN ${PIP_INSTALL} \

@ -1,5 +1,5 @@
ARG DRONE_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
ARG GIT_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
ARG PIP_INSTALL="pip install --no-cache-dir"
ARG DINGLEHOPPER_COMMIT="dcc10c5"

@ -1,5 +1,5 @@
ARG DRONE_COMMIT="latest"
FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
ARG GIT_COMMIT="latest"
FROM quratorspk/ocrd-galley-core-cuda10.0:$GIT_COMMIT
ARG PIP_INSTALL="pip install --no-cache-dir"
ARG EYNOLLAH_VERSION="0.0.10"
@ -10,10 +10,6 @@ RUN ${PIP_INSTALL} \
"eynollah == ${EYNOLLAH_VERSION}"
# Copy OCR models
COPY data/eynollah /var/lib/eynollah
# Check pip dependencies
RUN pip check

@ -1,5 +1,5 @@
ARG DRONE_COMMIT="latest"
FROM quratorspk/ocrd-galley-core-cuda10.1:$DRONE_COMMIT
ARG GIT_COMMIT="latest"
FROM quratorspk/ocrd-galley-core-cuda10.1:$GIT_COMMIT
ARG PIP_INSTALL="pip install --no-cache-dir"
ARG OCRD_ANYBASEOCR_VERSION="1.8.2"

@ -1,5 +1,5 @@
ARG DRONE_COMMIT="latest"
FROM quratorspk/ocrd-galley-core-cuda10.1:$DRONE_COMMIT
ARG GIT_COMMIT="latest"
FROM quratorspk/ocrd-galley-core-cuda10.1:$GIT_COMMIT
# XXX https://github.com/OCR-D/core/issues/642
@ -12,14 +12,6 @@ RUN ${PIP_INSTALL} \
"ocrd_calamari == $OCRD_CALAMARI_VERSION"
# Copy OCR models
RUN mkdir -p /var/lib/calamari-models/GT4HistOCR
COPY data/calamari-models/GT4HistOCR/2019-12-11T11_10+0100 /var/lib/calamari-models/GT4HistOCR/2019-12-11T11_10+0100
# XXX experimental
#COPY data/calamari-models/GT4HistOCR/2019-12-18T17_24+0100-with-augmentation-UNTESTED /var/lib/calamari-models/GT4HistOCR/2019-12-18T17_24+0100
#COPY data/mirror/github.com/Calamari-OCR/calamari_models/gt4histocr /var/lib/calamari-models/GT4HistOCR-chreul
# Check pip dependencies
RUN pip check

@ -1,5 +1,5 @@
ARG DRONE_COMMIT="latest"
FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
ARG GIT_COMMIT="latest"
FROM quratorspk/ocrd-galley-core-cuda10.0:$GIT_COMMIT
ARG PIP_INSTALL="pip install --no-cache-dir"
@ -13,12 +13,6 @@ RUN ${PIP_INSTALL} \
'ocrd_calamari == 0.0.7'
# Copy OCR models
RUN mkdir -p /var/lib/calamari-models/GT4HistOCR
COPY data/calamari-models/GT4HistOCR/2019-07-22T15_49+0200 /var/lib/calamari-models/GT4HistOCR/2019-07-22T15_49+0200
# Check pip dependencies
RUN pip check

@ -1,5 +1,5 @@
ARG DRONE_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
ARG GIT_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
ARG PIP_INSTALL="pip install --no-cache-dir"
ARG OCRD_CIS_VERSION="0.1.5"

@ -1,5 +1,5 @@
ARG DRONE_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
ARG GIT_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
ARG PIP_INSTALL="pip install --no-cache-dir"
ARG OCRD_FILEFORMAT_VERSION="0.5.0"

@ -1,5 +1,5 @@
ARG DRONE_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
ARG GIT_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
ARG PIP_INSTALL="pip install --no-cache-dir"
ARG OCRD_OLENA_VERSION="1.3.0"

@ -1,5 +1,5 @@
ARG DRONE_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
ARG GIT_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
ARG PIP_INSTALL="pip install --no-cache-dir"
ARG OCRD_SEGMENT_VERSION="0.1.21"

@ -1,5 +1,5 @@
ARG DRONE_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
ARG GIT_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
ARG PIP_INSTALL="pip install --no-cache-dir"
ARG TESSDATA_BEST_VERSION="4.0.0"
@ -17,13 +17,6 @@ RUN add-apt-repository ppa:alex-p/tesseract-ocr && \
apt-get clean && rm -rf /var/lib/apt/lists/*
# Copy OCR models
RUN mkdir -p $TESSDATA_PREFIX
ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/
COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/
RUN curl -sSL -O https://ub-backup.bib.uni-mannheim.de/~stweil/ocrd-train/data/Fraktur_5000000/tessdata_fast/Fraktur_50000000.502_198857.traineddata && \
mv *.traineddata $TESSDATA_PREFIX/
# Build pip installable stuff
RUN ${PIP_INSTALL} \
"ocrd_tesserocr == ${OCRD_TESSEROCR_VERSION}"

@ -1,5 +1,5 @@
ARG DRONE_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
ARG GIT_COMMIT="latest"
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
ARG PIP_INSTALL="pip install --no-cache-dir"
ARG OCRD_WRAP_VERSION="0.1.7"

@ -1,5 +1,5 @@
ARG DRONE_COMMIT="latest"
FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
ARG GIT_COMMIT="latest"
FROM quratorspk/ocrd-galley-core-cuda10.0:$GIT_COMMIT
ARG PIP_INSTALL="pip install --no-cache-dir"
ARG SBB_BINARIZATION_VERSION="0.0.10"
@ -11,10 +11,6 @@ RUN ${PIP_INSTALL} \
"sbb_binarization == $SBB_BINARIZATION_VERSION"
# Copy models
COPY data/sbb_binarization/2021-03-09 /var/lib/sbb_binarization
# Check pip dependencies
RUN pip check

@ -1,5 +1,5 @@
ARG DRONE_COMMIT="latest"
FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
ARG GIT_COMMIT="latest"
FROM quratorspk/ocrd-galley-core-cuda10.0:$GIT_COMMIT
ARG PIP_INSTALL="pip install --no-cache-dir"
ARG SBB_TEXTLINE_DETECTOR_COMMIT="c4df3d6"
@ -12,10 +12,6 @@ RUN ${PIP_INSTALL} \
https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz
# Copy OCR models
COPY data/textline_detection /var/lib/textline_detection
# Check pip dependencies
RUN pip check

@ -29,14 +29,9 @@ including all dependencies in Docker.
How to use
----------
**Currently, due to problems with Travis CI, we do not provide pre-built
containers anymore.**
To build the containers yourself using Docker:
~~~
cd ~/devel/ocrd-galley/
./build
~~~
ocrd-galley uses Docker to run the OCR-D images. We provide pre-built container
images that get downloaded automatically when you run the provided wrappers for
the OCR-D processors.
You can then install the wrappers into a Python venv:
~~~
@ -44,9 +39,13 @@ cd ~/devel/ocrd-galley/wrapper
pip install .
~~~
To download models, you need to use the `-a` flag of `ocrd resmgr`:
~~~
ocrd resmgr download -a ocrd-calamari-recognize default
~~~
You may then use the script `my_ocrd_workflow` to use your self-built
containers on an example workspace:
~~~
# Download an example workspace
cd /tmp
@ -110,3 +109,11 @@ cd workspace-xxxxx # output by the last command
~~~
This produces a workspace from the files and then runs the OCR workflow on it.
Build the containers yourself
-----------------------------
To build the containers yourself using Docker:
~~~
cd ~/devel/ocrd-galley/
./build
~~~

@ -45,7 +45,7 @@ get_from_web() {
download_to --strip-components 1 'https://qurator-data.de/eynollah/models_eynollah.tar.gz' 'eynollah'
}
. $self_dir/qurator_data_lib.sh
handle_data
#handle_data

@ -1 +0,0 @@
Subproject commit 9ab08a3626dde1d38dd622b65e425277cd029722

@ -1,12 +1,18 @@
import os
import subprocess
import sys
from pathlib import Path
DOCKER_IMAGE_PREFIX = os.environ.get("DOCKER_IMAGE_PREFIX", "quratorspk/ocrd-galley")
DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "latest")
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
# xdg-user-dirs is only available under Python 3.10+ (and so on), so it is
# simpler to just roll our own.
XDG_CONFIG_HOME = os.environ.get("XDG_CONFIG_HOME", Path.home() / ".config")
XDG_DATA_HOME = os.environ.get("XDG_DATA_HOME", Path.home() / ".local" / "share")
sub_images = {
"ocrd": "core",
@ -56,6 +62,8 @@ def main():
sub_image = sub_images[argv[0]]
docker_image = "%s-%s:%s" % (DOCKER_IMAGE_PREFIX, sub_image, DOCKER_IMAGE_TAG)
if DOCKER_IMAGE_TAG != "latest":
print(f"Using {docker_image}")
docker_run(argv, docker_image)
@ -67,6 +75,13 @@ def docker_run(argv, docker_image):
docker_run_options.extend(["-e", "LOG_LEVEL=%s" % LOG_LEVEL])
docker_run_options.extend(["-e", "_OCRD_COMPLETE"])
docker_run_options.extend(["-e", "XDG_CONFIG_HOME=%s" % XDG_CONFIG_HOME])
docker_run_options.extend(["--mount", "type=bind,src=%s,target=%s" %
(XDG_CONFIG_HOME, XDG_CONFIG_HOME)])
docker_run_options.extend(["-e", "XDG_DATA_HOME=%s" % XDG_DATA_HOME])
docker_run_options.extend(["--mount", "type=bind,src=%s,target=%s" %
(XDG_DATA_HOME, XDG_DATA_HOME)])
# JAVA_TOOL_OPTIONS is used for Java proxy settings
if os.environ.get("JAVA_TOOL_OPTIONS"):
docker_run_options.extend(["-e", "JAVA_TOOL_OPTIONS"])

Loading…
Cancel
Save