Mirror of https://github.com/qurator-spk/ocrd-galley.git (synced 2025-07-15 15:59:52 +02:00)

Commit 0cfeb76bba: Merge branch 'test/github-actions'

23 changed files with 264 additions and 175 deletions
.drone.star (file removed, 74 lines)

@@ -1,74 +0,0 @@
def main(ctx):
    tags = [ctx.build.commit]

    if ctx.build.event == "tag":
        name = "release"
    elif ctx.build.branch == "master":
        name = "master"
        tags.append("latest")
    else:
        return

    return [
        {
            "kind": "pipeline",
            "name": name,
            "steps": [
                {
                    "name": "prepare data",
                    "image": "alpine",
                    "commands": [
                        "apk update && apk add bash curl",
                        "FORCE_DOWNLOAD=y ./build-tmp-XXX"
                    ]
                },
                # We can't glob and have to add here manually...
                step_for(ctx, "core", tags),
                step_for(ctx, "core-cuda10.0", tags),
                step_for(ctx, "core-cuda10.1", tags),

                step_for(ctx, "dinglehopper", tags),
                step_for(ctx, "ocrd_calamari", tags),
                step_for(ctx, "ocrd_calamari03", tags),
                step_for(ctx, "ocrd_cis", tags),
                step_for(ctx, "ocrd_fileformat", tags),
                step_for(ctx, "ocrd_olena", tags),
                step_for(ctx, "ocrd_segment", tags),
                step_for(ctx, "ocrd_tesserocr", tags),
                step_for(ctx, "ocrd_wrap", tags),
                step_for(ctx, "sbb_binarization", tags),
                step_for(ctx, "sbb_textline_detector", tags),
                step_for(ctx, "eynollah", tags),
                step_for(ctx, "ocrd_anybaseocr", tags),
                {
                    "name": "notify",
                    "image": "drillster/drone-email",
                    "settings": {
                        "host": "172.17.0.1",
                        "port": "25",
                        "from": "drone@ci.moegen-wir.net",
                    },
                    "when": {
                        "status": [ "success", "failure" ]
                    }
                }
            ]
        }
    ]


def step_for(ctx, sub_image, tags):
    return {
        "name": "build %s" % sub_image,
        "image": "plugins/docker",
        "settings": {
            "build_args": [
                "DRONE_COMMIT=%s" % ctx.build.commit,
            ],
            "tags": tags,
            "username": { "from_secret": "docker_username" },
            "password": { "from_secret": "docker_password" },
            "repo": "quratorspk/ocrd-galley-%s" % sub_image,
            "dockerfile": "Dockerfile-%s" % sub_image,
        }
    }
.github/list-subimages (new executable file, 38 lines)

@@ -0,0 +1,38 @@
#!/usr/bin/python3
import glob
import re
import sys
import argparse
import json


all_subimages = {re.sub(r"^Dockerfile-", "", dockerfile) for dockerfile in glob.glob("Dockerfile-*")}
core_subimages = {si for si in all_subimages if si.startswith("core")}
rest_subimages = all_subimages - core_subimages


parser = argparse.ArgumentParser(description='List subimages.')
parser.add_argument('--core', action='store_true',
                    default=False, help='List core subimages')
parser.add_argument('--rest', action='store_true',
                    default=False, help='List rest subimages')
parser.add_argument('--json', action='store_true',
                    default=False, help='Return list as JSON')
args = parser.parse_args()


def list_(subimages):
    subimages = sorted(subimages)
    if args.json:
        print(json.dumps(subimages))
    else:
        print("\n".join(subimages))


if not args.core and not args.rest:
    list_(core_subimages | rest_subimages)
if args.core:
    list_(core_subimages)
if args.rest:
    list_(rest_subimages)
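For illustration, the core/rest split above works out like this on a hypothetical set of Dockerfiles (a sketch, not part of the repository; the names mirror Dockerfiles touched by this commit):

~~~
# Hypothetical inputs following the Dockerfile-<subimage> naming convention:
all_subimages = {"core", "core-cuda10.0", "dinglehopper", "ocrd_tesserocr"}
core_subimages = {si for si in all_subimages if si.startswith("core")}
rest_subimages = all_subimages - core_subimages

print(sorted(core_subimages))  # ['core', 'core-cuda10.0']
print(sorted(rest_subimages))  # ['dinglehopper', 'ocrd_tesserocr']
~~~

The workflow below builds the core images first and the rest afterwards, since every other Dockerfile starts FROM one of the core images.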
.github/workflows/build-subimage.yml (new file, 58 lines)

@@ -0,0 +1,58 @@
on:
  workflow_call:
    inputs:
      subimage:
        required: true
        type: string
      tags:
        required: true
        type: string
    secrets:
      DOCKERHUB_USERNAME:
        required: true
      DOCKERHUB_TOKEN:
        required: true


jobs:
  build-subimage-job:
    runs-on: ubuntu-latest
    steps:
      -
        name: Checkout
        uses: actions/checkout@v3
        # We are checking out explicitly, so build-push-action isn't trying
        # to checkout the (unreachable) submodule. (Using "context" there.)
      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
      -
        name: Docker meta
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: |
            quratorspk/ocrd-galley-${{ inputs.subimage }}
          flavor: |
            latest=auto
          # latest=auto should generate "latest" for the type=semver tags entry
          tags: ${{ inputs.tags }}
      -
        name: Login to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      -
        name: Build ${{ inputs.subimage }}
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile-${{ inputs.subimage }}
          build-args: |
            GIT_COMMIT=sha-${{ github.sha }}
            BUILDKIT_INLINE_CACHE=1
          tags: ${{ steps.meta.outputs.tags }}
          push: true

          cache-from: quratorspk/ocrd-galley-${{ inputs.subimage }}:sha-${{ github.sha }}
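Concretely, for one sub-image the reusable workflow above amounts to something like the following (a hypothetical local equivalent, not part of the repository; the image name pattern, the sha- tag prefix and the GIT_COMMIT build argument are taken from the workflow and the Dockerfiles below):

~~~
import subprocess

def build_subimage(subimage: str, sha: str) -> None:
    """Roughly what build-subimage.yml does for one Dockerfile-<subimage>."""
    image = f"quratorspk/ocrd-galley-{subimage}"
    subprocess.run([
        "docker", "build",
        "--file", f"Dockerfile-{subimage}",
        # Consumed by the FROM ...:$GIT_COMMIT line, so the sub-image is built
        # on top of the core image from the same commit:
        "--build-arg", f"GIT_COMMIT=sha-{sha}",
        "--build-arg", "BUILDKIT_INLINE_CACHE=1",
        # A later re-run of the same build can reuse the pushed sha-tagged image:
        "--cache-from", f"{image}:sha-{sha}",
        "--tag", f"{image}:sha-{sha}",
        ".",
    ], check=True)

# e.g. build_subimage("dinglehopper", "<full commit SHA>")
~~~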
.github/workflows/build.yml (98 changed lines)

@@ -1,6 +1,7 @@
 name: build
 
 on:
 
   workflow_dispatch:
 
   push:
@@ -9,28 +10,93 @@ on:
       - 'test/github-actions'
 
 jobs:
-  docker:
+  matrix:
     runs-on: ubuntu-latest
+    outputs:
+      core: ${{ steps.step1.outputs.core }}
+      rest: ${{ steps.step1.outputs.rest }}
+      all: ${{ steps.step1.outputs.all }}
     steps:
       -
         name: Checkout
         uses: actions/checkout@v3
-        # We are checking out explicitly, so build-push-action isn't trying
-        # to checkout the (unreachable) submodule. (Using "context" there.)
 
       -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        name: Generate outputs
+        id: step1
+        run: |
+          echo "core=$(./.github/list-subimages --core --json)" >>$GITHUB_OUTPUT
+          echo "rest=$(./.github/list-subimages --rest --json)" >>$GITHUB_OUTPUT
+          echo "all=$(./.github/list-subimages --json)" >>$GITHUB_OUTPUT
 
-      # TODO data
-      # TODO matrix for all Dockerfiles
+          echo "GITHUB_OUTPUT:"
+          cat $GITHUB_OUTPUT
 
-      -
-        name: Build
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          file: Dockerfile-core
-          build-args:
-            DRONE_COMMIT=${{ github.sha }}
-          push: false
+  build-core:
+    needs: matrix
+    strategy:
+      matrix:
+        subimage: ${{ fromJson(needs.matrix.outputs.core) }}
+    uses: ./.github/workflows/build-subimage.yml
+    with:
+      subimage: ${{ matrix.subimage }}
+      tags: |
+        type=sha,format=long
+      # Here: NOT the full tags, just the sha! (they get added below)
+    secrets: inherit
+
+  build-rest:
+    needs: [matrix, build-core]
+    strategy:
+      matrix:
+        subimage: ${{ fromJson(needs.matrix.outputs.rest) }}
+    uses: ./.github/workflows/build-subimage.yml
+    with:
+      subimage: ${{ matrix.subimage }}
+      tags: |
+        type=sha,format=long
+    secrets: inherit
+
+  test:
+    needs: build-rest
+    runs-on: ubuntu-latest
+    env:
+      DOCKER_IMAGE_TAG: sha-${{ github.sha }}  # needed to run the correct version through the wrapper
+    steps:
+      -
+        name: Checkout
+        uses: actions/checkout@v3
+      -
+        name: Install wrapper
+        run: |
+          sudo apt-get install -y python3-pip
+          cd wrapper && pip install .
+      -
+        name: Test
+        run: |
+          ocrd --version
+          ocrd-dinglehopper --version
+
+  # At this point, we have successfully built, uploaded and tested the images. We now just need to add
+  # tags. We do this by building again, but using the formerly built images to
+  # cache from.
+
+  push-with-tags:
+    needs: [matrix, test]
+    strategy:
+      matrix:
+        subimage: ${{ fromJson(needs.matrix.outputs.all) }}
+    uses: ./.github/workflows/build-subimage.yml
+    with:
+      subimage: ${{ matrix.subimage }}
+      tags: |
+        type=sha,format=long
+        type=edge,branch=master
+        type=ref,event=branch
+        type=semver,pattern={{version}}
+      # Here: full tags
+      # Note: Do NOT use event=tag here, unless re-configuring the "latest"
+      # behavior too as that triggers on event=tag by default. By default,
+      # "latest" triggers on type=semver here, too (which is wanted).
+    secrets: inherit
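The matrix job feeds the JSON emitted by .github/list-subimages into fromJson() to fan out one reusable-workflow call per sub-image. Roughly (a sketch assuming it is run from the repository root; the example list in the comment is only illustrative):

~~~
import json
import subprocess

def subimage_matrix(*flags: str) -> list:
    # Same command the "Generate outputs" step writes to $GITHUB_OUTPUT,
    # e.g. ["core", "core-cuda10.0", "core-cuda10.1"] for --core.
    out = subprocess.run(["./.github/list-subimages", *flags, "--json"],
                         capture_output=True, text=True, check=True).stdout
    return json.loads(out)

# Each entry becomes one matrix.subimage value passed to build-subimage.yml:
for subimage in subimage_matrix("--core"):
    print("build-subimage.yml would be called with subimage =", subimage)
~~~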
.gitmodules (3 lines removed)

@@ -1,3 +0,0 @@
-[submodule "data"]
-    path = data
-    url = git@code.dev.sbb.berlin:qurator/qurator-data.git
Dockerfile-core

@@ -9,10 +9,10 @@ ENV PIP_DEFAULT_TIMEOUT=120
 RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
     apt-get update && \
     apt-get install -y \
-        curl xz-utils \
-        build-essential python3-dev \
-        # For get-pip.py:
-        python3-distutils \
+        build-essential \
+        curl \
+        git \
+        xz-utils \
         # For add-apt-repository:
         software-properties-common \
         # XML utils
@@ -20,6 +20,17 @@ RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
         xmlstarlet \
         # OCR-D uses ImageMagick for pixel density estimation
         imagemagick \
+        # pyenv builds
+        # TODO: builder container?
+        libz-dev \
+        libssl-dev \
+        libbz2-dev \
+        liblzma-dev \
+        libncurses-dev \
+        libffi-dev \
+        libreadline-dev \
+        libsqlite3-dev \
+        libmagic-dev \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -29,14 +40,19 @@ RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \
 RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py
 
-# Install pip (and setuptools)
-# We use get-pip.py here to avoid
-# a. having to upgrade from Ubuntu's pip
-# b. the dreaded "old script wrapper" error message
-RUN curl -sSL https://bootstrap.pypa.io/pip/3.6/get-pip.py -o get-pip.py && \
-    python3 get-pip.py && \
-    rm -f get-pip.py
+# Install pyenv
+# TODO: do not run as root
+# TODO: does just saying "3.7" work as intended?
+ENV HOME=/root
+ENV PYENV_ROOT=/usr/local/share/pyenv
+ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
+RUN \
+    git clone --depth=1 https://github.com/yyuu/pyenv.git $PYENV_ROOT && \
+    pyenv install 3.7 && \
+    pyenv global 3.7 && \
+    pyenv rehash && \
+    pip install -U pip && \
+    pip install setuptools
 
 # Install pip installable-stuff
 RUN ${PIP_INSTALL} \
Dockerfile-dinglehopper

@@ -1,5 +1,5 @@
-ARG DRONE_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
+ARG GIT_COMMIT="latest"
+FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
 
 ARG PIP_INSTALL="pip install --no-cache-dir"
 ARG DINGLEHOPPER_COMMIT="dcc10c5"
Dockerfile-eynollah

@@ -1,5 +1,5 @@
-ARG DRONE_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
+ARG GIT_COMMIT="latest"
+FROM quratorspk/ocrd-galley-core-cuda10.0:$GIT_COMMIT
 
 ARG PIP_INSTALL="pip install --no-cache-dir"
 ARG EYNOLLAH_VERSION="0.0.10"
@@ -10,10 +10,6 @@ RUN ${PIP_INSTALL} \
     "eynollah == ${EYNOLLAH_VERSION}"
 
 
-# Copy OCR models
-COPY data/eynollah /var/lib/eynollah
-
-
 # Check pip dependencies
 RUN pip check
Dockerfile-ocrd_anybaseocr

@@ -1,5 +1,5 @@
-ARG DRONE_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core-cuda10.1:$DRONE_COMMIT
+ARG GIT_COMMIT="latest"
+FROM quratorspk/ocrd-galley-core-cuda10.1:$GIT_COMMIT
 
 ARG PIP_INSTALL="pip install --no-cache-dir"
 ARG OCRD_ANYBASEOCR_VERSION="1.8.2"
Dockerfile-ocrd_calamari

@@ -1,5 +1,5 @@
-ARG DRONE_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core-cuda10.1:$DRONE_COMMIT
+ARG GIT_COMMIT="latest"
+FROM quratorspk/ocrd-galley-core-cuda10.1:$GIT_COMMIT
 
 
 # XXX https://github.com/OCR-D/core/issues/642
@@ -12,14 +12,6 @@ RUN ${PIP_INSTALL} \
     "ocrd_calamari == $OCRD_CALAMARI_VERSION"
 
 
-# Copy OCR models
-RUN mkdir -p /var/lib/calamari-models/GT4HistOCR
-COPY data/calamari-models/GT4HistOCR/2019-12-11T11_10+0100 /var/lib/calamari-models/GT4HistOCR/2019-12-11T11_10+0100
-# XXX experimental
-#COPY data/calamari-models/GT4HistOCR/2019-12-18T17_24+0100-with-augmentation-UNTESTED /var/lib/calamari-models/GT4HistOCR/2019-12-18T17_24+0100
-#COPY data/mirror/github.com/Calamari-OCR/calamari_models/gt4histocr /var/lib/calamari-models/GT4HistOCR-chreul
-
-
 # Check pip dependencies
 RUN pip check
Dockerfile-ocrd_calamari03

@@ -1,5 +1,5 @@
-ARG DRONE_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
+ARG GIT_COMMIT="latest"
+FROM quratorspk/ocrd-galley-core-cuda10.0:$GIT_COMMIT
 
 ARG PIP_INSTALL="pip install --no-cache-dir"
 
@@ -13,12 +13,6 @@ RUN ${PIP_INSTALL} \
     'ocrd_calamari == 0.0.7'
 
 
-# Copy OCR models
-RUN mkdir -p /var/lib/calamari-models/GT4HistOCR
-COPY data/calamari-models/GT4HistOCR/2019-07-22T15_49+0200 /var/lib/calamari-models/GT4HistOCR/2019-07-22T15_49+0200
-
-
 # Check pip dependencies
 RUN pip check
Dockerfile-ocrd_cis

@@ -1,5 +1,5 @@
-ARG DRONE_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
+ARG GIT_COMMIT="latest"
+FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
 
 ARG PIP_INSTALL="pip install --no-cache-dir"
 ARG OCRD_CIS_VERSION="0.1.5"
Dockerfile-ocrd_fileformat

@@ -1,5 +1,5 @@
-ARG DRONE_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
+ARG GIT_COMMIT="latest"
+FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
 
 ARG PIP_INSTALL="pip install --no-cache-dir"
 ARG OCRD_FILEFORMAT_VERSION="0.5.0"
Dockerfile-ocrd_olena

@@ -1,5 +1,5 @@
-ARG DRONE_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
+ARG GIT_COMMIT="latest"
+FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
 
 ARG PIP_INSTALL="pip install --no-cache-dir"
 ARG OCRD_OLENA_VERSION="1.3.0"
Dockerfile-ocrd_segment

@@ -1,5 +1,5 @@
-ARG DRONE_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
+ARG GIT_COMMIT="latest"
+FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
 
 ARG PIP_INSTALL="pip install --no-cache-dir"
 ARG OCRD_SEGMENT_VERSION="0.1.21"
Dockerfile-ocrd_tesserocr

@@ -1,5 +1,5 @@
-ARG DRONE_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
+ARG GIT_COMMIT="latest"
+FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
 
 ARG PIP_INSTALL="pip install --no-cache-dir"
 ARG TESSDATA_BEST_VERSION="4.0.0"
@@ -17,13 +17,6 @@ RUN add-apt-repository ppa:alex-p/tesseract-ocr && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 
 
-# Copy OCR models
-RUN mkdir -p $TESSDATA_PREFIX
-ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/
-COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/
-RUN curl -sSL -O https://ub-backup.bib.uni-mannheim.de/~stweil/ocrd-train/data/Fraktur_5000000/tessdata_fast/Fraktur_50000000.502_198857.traineddata && \
-    mv *.traineddata $TESSDATA_PREFIX/
-
 # Build pip installable stuff
 RUN ${PIP_INSTALL} \
     "ocrd_tesserocr == ${OCRD_TESSEROCR_VERSION}"
Dockerfile-ocrd_wrap

@@ -1,5 +1,5 @@
-ARG DRONE_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT
+ARG GIT_COMMIT="latest"
+FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
 
 ARG PIP_INSTALL="pip install --no-cache-dir"
 ARG OCRD_WRAP_VERSION="0.1.7"
Dockerfile-sbb_binarization

@@ -1,5 +1,5 @@
-ARG DRONE_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
+ARG GIT_COMMIT="latest"
+FROM quratorspk/ocrd-galley-core-cuda10.0:$GIT_COMMIT
 
 ARG PIP_INSTALL="pip install --no-cache-dir"
 ARG SBB_BINARIZATION_VERSION="0.0.10"
@@ -11,10 +11,6 @@ RUN ${PIP_INSTALL} \
     "sbb_binarization == $SBB_BINARIZATION_VERSION"
 
 
-# Copy models
-COPY data/sbb_binarization/2021-03-09 /var/lib/sbb_binarization
-
-
 # Check pip dependencies
 RUN pip check
Dockerfile-sbb_textline_detector

@@ -1,5 +1,5 @@
-ARG DRONE_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT
+ARG GIT_COMMIT="latest"
+FROM quratorspk/ocrd-galley-core-cuda10.0:$GIT_COMMIT
 
 ARG PIP_INSTALL="pip install --no-cache-dir"
 ARG SBB_TEXTLINE_DETECTOR_COMMIT="c4df3d6"
@@ -12,10 +12,6 @@ RUN ${PIP_INSTALL} \
     https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz
 
 
-# Copy OCR models
-COPY data/textline_detection /var/lib/textline_detection
-
-
 # Check pip dependencies
 RUN pip check
README.md (25 changed lines)

@@ -29,14 +29,9 @@ including all dependencies in Docker.
 
 How to use
 ----------
-**Currently, due to problems with the Travis CI, we do not provide pre-built
-containers anymore.***
+ocrd-galley uses Docker to run the OCR-D images. We provide pre-built container
+images that get downloaded automatically when you run the provided wrappers for
+the OCR-D processors.
 
-To build the containers yourself using Docker:
-~~~
-cd ~/devel/ocrd-galley/
-./build
-~~~
-
 You can then install the wrappers into a Python venv:
 ~~~
@@ -44,9 +39,13 @@ cd ~/devel/ocrd-galley/wrapper
 pip install .
 ~~~
 
+To download models, you need to use the `-a` flag of `ocrd resmgr`:
+~~~
+ocrd resmgr download -a ocrd-calamari-recognize default
+~~~
+
 You may then use the script `my_ocrd_workflow` to use your self-built
 containers on an example workspace:
 
 ~~~
 # Download an example workspace
 cd /tmp
@@ -110,3 +109,11 @@ cd workspace-xxxxx # output by the last command
 ~~~
 
 This produces a workspace from the files and then runs the OCR workflow on it.
+
+Build the containers yourself
+-----------------------------
+To build the containers yourself using Docker:
+~~~
+cd ~/devel/ocrd-galley/
+./build
+~~~
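For what it's worth, the CI test job introduced above exercises exactly these wrapper entry points. A rough local equivalent (a sketch; the abbreviated SHA is only illustrative, and any sha- tag you use must actually exist on Docker Hub) would be:

~~~
import os
import subprocess

# Pin the wrapper to the images built from a specific commit, as the CI test
# job does via the DOCKER_IMAGE_TAG environment variable:
env = dict(os.environ, DOCKER_IMAGE_TAG="sha-0cfeb76bba")

subprocess.run(["ocrd", "--version"], env=env, check=True)
subprocess.run(["ocrd-dinglehopper", "--version"], env=env, check=True)
~~~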
build (2 changed lines)

@@ -45,7 +45,7 @@ get_from_web() {
     download_to --strip-components 1 'https://qurator-data.de/eynollah/models_eynollah.tar.gz' 'eynollah'
 }
 
 . $self_dir/qurator_data_lib.sh
-handle_data
+#handle_data
data (submodule removed)

@@ -1 +0,0 @@
-Subproject commit 9ab08a3626dde1d38dd622b65e425277cd029722
wrapper (Python module)

@@ -1,12 +1,18 @@
 import os
 import subprocess
 import sys
+from pathlib import Path
 
 
 DOCKER_IMAGE_PREFIX = os.environ.get("DOCKER_IMAGE_PREFIX", "quratorspk/ocrd-galley")
 DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "latest")
 LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
 
+# xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler
+# to just roll it on our own.
+XDG_CONFIG_HOME = os.environ.get("XDG_CONFIG_HOME", Path.home() / ".config")
+XDG_DATA_HOME = os.environ.get("XDG_DATA_HOME", Path.home() / ".local" / "share")
+
 
 sub_images = {
     "ocrd": "core",
@@ -56,6 +62,8 @@ def main():
     sub_image = sub_images[argv[0]]
     docker_image = "%s-%s:%s" % (DOCKER_IMAGE_PREFIX, sub_image, DOCKER_IMAGE_TAG)
 
+    if DOCKER_IMAGE_TAG != "latest":
+        print(f"Using {docker_image}")
     docker_run(argv, docker_image)
@@ -67,6 +75,13 @@ def docker_run(argv, docker_image):
     docker_run_options.extend(["-e", "LOG_LEVEL=%s" % LOG_LEVEL])
     docker_run_options.extend(["-e", "_OCRD_COMPLETE"])
 
+    docker_run_options.extend(["-e", "XDG_CONFIG_HOME=%s" % XDG_CONFIG_HOME])
+    docker_run_options.extend(["--mount", "type=bind,src=%s,target=%s" %
+                               (XDG_CONFIG_HOME, XDG_CONFIG_HOME)])
+    docker_run_options.extend(["-e", "XDG_DATA_HOME=%s" % XDG_DATA_HOME])
+    docker_run_options.extend(["--mount", "type=bind,src=%s,target=%s" %
+                               (XDG_DATA_HOME, XDG_DATA_HOME)])
+
     # JAVA_TOOL_OPTIONS is used for Java proxy settings
     if os.environ.get("JAVA_TOOL_OPTIONS"):
         docker_run_options.extend(["-e", "JAVA_TOOL_OPTIONS"])
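Shown in isolation, the XDG handling the wrapper gains here boils down to the following sketch (same fallbacks as above; the printed strings are the options the wrapper appends to docker run):

~~~
import os
from pathlib import Path

# Honour $XDG_CONFIG_HOME / $XDG_DATA_HOME if set, otherwise use the spec defaults.
XDG_CONFIG_HOME = os.environ.get("XDG_CONFIG_HOME", Path.home() / ".config")
XDG_DATA_HOME = os.environ.get("XDG_DATA_HOME", Path.home() / ".local" / "share")

# Bind-mounting these into the container lets downloads (e.g. models fetched
# with `ocrd resmgr`) persist on the host between runs.
for xdg_dir in (XDG_CONFIG_HOME, XDG_DATA_HOME):
    print("--mount", "type=bind,src=%s,target=%s" % (xdg_dir, xdg_dir))
~~~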