diff --git a/.drone.star b/.drone.star deleted file mode 100644 index 623c861..0000000 --- a/.drone.star +++ /dev/null @@ -1,74 +0,0 @@ -def main(ctx): - tags = [ctx.build.commit] - - if ctx.build.event == "tag": - name = "release" - elif ctx.build.branch == "master": - name = "master" - tags.append("latest") - else: - return - - return [ - { - "kind": "pipeline", - "name": name, - "steps": [ - { - "name": "prepare data", - "image": "alpine", - "commands": [ - "apk update && apk add bash curl", - "FORCE_DOWNLOAD=y ./build-tmp-XXX" - ] - }, - # We can't glob and have to add here manually... - step_for(ctx, "core", tags), - step_for(ctx, "core-cuda10.0", tags), - step_for(ctx, "core-cuda10.1", tags), - - step_for(ctx, "dinglehopper", tags), - step_for(ctx, "ocrd_calamari", tags), - step_for(ctx, "ocrd_calamari03", tags), - step_for(ctx, "ocrd_cis", tags), - step_for(ctx, "ocrd_fileformat", tags), - step_for(ctx, "ocrd_olena", tags), - step_for(ctx, "ocrd_segment", tags), - step_for(ctx, "ocrd_tesserocr", tags), - step_for(ctx, "ocrd_wrap", tags), - step_for(ctx, "sbb_binarization", tags), - step_for(ctx, "sbb_textline_detector", tags), - step_for(ctx, "eynollah", tags), - step_for(ctx, "ocrd_anybaseocr", tags), - { - "name": "notify", - "image": "drillster/drone-email", - "settings": { - "host": "172.17.0.1", - "port": "25", - "from": "drone@ci.moegen-wir.net", - }, - "when": { - "status": [ "success", "failure" ] - } - } - ] - } - ] - - -def step_for(ctx, sub_image, tags): - return { - "name": "build %s" % sub_image, - "image": "plugins/docker", - "settings": { - "build_args": [ - "DRONE_COMMIT=%s" % ctx.build.commit, - ], - "tags": tags, - "username": { "from_secret": "docker_username" }, - "password": { "from_secret": "docker_password" }, - "repo": "quratorspk/ocrd-galley-%s" % sub_image, - "dockerfile": "Dockerfile-%s" % sub_image, - } - } diff --git a/.github/list-subimages b/.github/list-subimages new file mode 100755 index 0000000..7966019 --- /dev/null +++ 
b/.github/list-subimages @@ -0,0 +1,38 @@ +#!/usr/bin/python3 +import glob +import re +import sys +import argparse +import json + + +all_subimages = {re.sub(r"^Dockerfile-", "", dockerfile) for dockerfile in glob.glob("Dockerfile-*")} +core_subimages = {si for si in all_subimages if si.startswith("core")} +rest_subimages = all_subimages - core_subimages + + + +parser = argparse.ArgumentParser(description='List subimages.') +parser.add_argument('--core', action='store_true', + default=False, help='List core subimages') +parser.add_argument('--rest', action='store_true', + default=False, help='List rest subimages') +parser.add_argument('--json', action='store_true', + default=False, help='Return list as JSON') +args = parser.parse_args() + + +def list_(subimages): + subimages = sorted(subimages) + if args.json: + print(json.dumps(subimages)) + else: + print("\n".join(subimages)) + + +if not args.core and not args.rest: + list_(core_subimages | rest_subimages) +if args.core: + list_(core_subimages) +if args.rest: + list_(rest_subimages) diff --git a/.github/workflows/build-subimage.yml b/.github/workflows/build-subimage.yml new file mode 100644 index 0000000..0fe98ce --- /dev/null +++ b/.github/workflows/build-subimage.yml @@ -0,0 +1,58 @@ +on: + workflow_call: + inputs: + subimage: + required: true + type: string + tags: + required: true + type: string + secrets: + DOCKERHUB_USERNAME: + required: true + DOCKERHUB_TOKEN: + required: true + + +jobs: + build-subimage-job: + runs-on: ubuntu-latest + steps: + - + name: Checkout + uses: actions/checkout@v3 + # We are checking out explicitly, so build-push-action isn't trying + # to checkout the (unreachable) submodule. (Using "context" there.) 
+ - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - + name: Docker meta + id: meta + uses: docker/metadata-action@v4 + with: + images: | + quratorspk/ocrd-galley-${{ inputs.subimage }} + flavor: | + latest=auto + # latest=auto should generate "latest" for the type=semver tags entry + tags: ${{ inputs.tags }} + - + name: Login to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - + name: Build ${{ inputs.subimage }} + uses: docker/build-push-action@v4 + with: + context: . + file: Dockerfile-${{ inputs.subimage }} + build-args: | + GIT_COMMIT=sha-${{ github.sha }} + BUILDKIT_INLINE_CACHE=1 + tags: ${{ steps.meta.outputs.tags }} + push: true + + cache-from: quratorspk/ocrd-galley-${{ inputs.subimage }}:sha-${{ github.sha }} diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 920e0b0..d1a90d3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,36 +1,104 @@ name: build on: + workflow_dispatch: push: branches: - 'master' - - 'test/github-actions' + - 'fix/*' jobs: - docker: + matrix: runs-on: ubuntu-latest + outputs: + core: ${{ steps.step1.outputs.core }} + rest: ${{ steps.step1.outputs.rest }} + all: ${{ steps.step1.outputs.all }} steps: - name: Checkout uses: actions/checkout@v3 - # We are checking out explicitly, so build-push-action isn't trying - # to checkout the (unreachable) submodule. (Using "context" there.) 
- - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + name: Generate outputs + id: step1 + run: | + echo "core=$(./.github/list-subimages --core --json)" >>$GITHUB_OUTPUT + echo "rest=$(./.github/list-subimages --rest --json)" >>$GITHUB_OUTPUT + echo "all=$(./.github/list-subimages --json)" >>$GITHUB_OUTPUT + + echo "GITHUB_OUTPUT:" + cat $GITHUB_OUTPUT + + build-core: + needs: matrix + strategy: + fail-fast: false + matrix: + subimage: ${{ fromJson(needs.matrix.outputs.core) }} + uses: ./.github/workflows/build-subimage.yml + with: + subimage: ${{ matrix.subimage }} + tags: | + type=sha,format=long + # Here: NOT the full tags, just the sha! (they get added below) + secrets: inherit - # TODO data - # TODO matrix for all Dockerfiles + build-rest: + needs: [matrix, build-core] + strategy: + fail-fast: false + matrix: + subimage: ${{ fromJson(needs.matrix.outputs.rest) }} + uses: ./.github/workflows/build-subimage.yml + with: + subimage: ${{ matrix.subimage }} + tags: | + type=sha,format=long + secrets: inherit + + test: + needs: build-rest + runs-on: ubuntu-latest + env: + DOCKER_IMAGE_TAG: sha-${{ github.sha }} # needed to run the correct version through the wrapper + steps: - - name: Build - uses: docker/build-push-action@v4 - with: - context: . - file: Dockerfile-core - build-args: - DRONE_COMMIT=${{ github.sha }} - push: false + name: Checkout + uses: actions/checkout@v3 + - + name: Install wrapper + run: | + sudo apt-get install -y python3-pip + cd wrapper && pip install . + - + name: Test + run: | + ocrd --version + ocrd-dinglehopper --version + + + # At this point, we have successfully built, uploaded and tested the images. We now just need to add + # tags. We do this by building again, but using the formerly built images to + # cache from. 
+ + push-with-tags: + needs: [matrix, test] + strategy: + matrix: + subimage: ${{ fromJson(needs.matrix.outputs.all) }} + uses: ./.github/workflows/build-subimage.yml + with: + subimage: ${{ matrix.subimage }} + tags: | + type=sha,format=long + type=edge,branch=master + type=ref,event=branch + type=semver,pattern={{version}} + # Here: full tags + # Note: Do NOT use event=tag here, unless re-configuring the "latest" + # behavior too as that triggers on event=tag by default. By default, + # "latest" triggers on type=semver here, too (which is wanted). + secrets: inherit diff --git a/.gitignore b/.gitignore index f37cdb5..42fe1ac 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +build/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index a08c53c..0000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "data"] - path = data - url = git@code.dev.sbb.berlin:qurator/qurator-data.git diff --git a/Dockerfile-core b/Dockerfile-core index 443db85..c655d56 100644 --- a/Dockerfile-core +++ b/Dockerfile-core @@ -1,7 +1,7 @@ -FROM ubuntu:18.04 +FROM ubuntu:22.04 ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_VERSION_MINIMUM="2.23.3" +ARG OCRD_VERSION_MINIMUM="2.47.0" ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 ENV PIP_DEFAULT_TIMEOUT=120 @@ -9,10 +9,11 @@ ENV PIP_DEFAULT_TIMEOUT=120 RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ apt-get update && \ apt-get install -y \ - curl xz-utils \ - build-essential python3-dev \ -# For get-pip.py: - python3-distutils \ + build-essential \ + curl \ + git \ + xz-utils \ + pkg-config \ # For add-apt-repository: software-properties-common \ # XML utils @@ -20,6 +21,17 @@ RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ xmlstarlet \ # OCR-D uses ImageMagick for pixel density estimation imagemagick \ +# pyenv builds +# TODO: builder container? 
+ libz-dev \ + libssl-dev \ + libbz2-dev \ + liblzma-dev \ + libncurses-dev \ + libffi-dev \ + libreadline-dev \ + libsqlite3-dev \ + libmagic-dev \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -29,14 +41,19 @@ RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py -# Install pip (and setuptools) -# We use get-pip.py here to avoid -# a. having to upgrade from Ubuntu's pip -# b. the dreaded "old script wrapper" error message -RUN curl -sSL https://bootstrap.pypa.io/pip/3.6/get-pip.py -o get-pip.py && \ - python3 get-pip.py && \ - rm -f get-pip.py - +# Install pyenv +# TODO: do not run as root +# TODO: does just saying "3.7" work as intended? +ENV HOME=/root +ENV PYENV_ROOT=/usr/local/share/pyenv +ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH +RUN \ + git clone --depth=1 https://github.com/yyuu/pyenv.git $PYENV_ROOT && \ + pyenv install 3.7 && \ + pyenv global 3.7 && \ + pyenv rehash && \ + pip install -U pip wheel && \ + pip install setuptools # Install pip installable-stuff RUN ${PIP_INSTALL} \ diff --git a/Dockerfile-core-cuda10.0 b/Dockerfile-core-cuda10.0 deleted file mode 100644 index f1a2d60..0000000 --- a/Dockerfile-core-cuda10.0 +++ /dev/null @@ -1,53 +0,0 @@ -FROM nvidia/cuda:10.0-cudnn7-runtime-ubuntu18.04 - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_VERSION_MINIMUM="2.23.3" -ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 -ENV PIP_DEFAULT_TIMEOUT=120 - - -RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ - apt-get update && \ - apt-get install -y \ - curl xz-utils \ - build-essential python3-dev \ -# For get-pip.py: - python3-distutils \ -# For add-apt-repository: - software-properties-common \ -# XML utils - libxml2-utils \ - xmlstarlet \ -# OCR-D uses ImageMagick for pixel density estimation - imagemagick \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - - -# Set up OCR-D logging -RUN echo 
"setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py - - -# Install pip (and setuptools) -# We use get-pip.py here to avoid -# a. having to upgrade from Ubuntu's pip -# b. the dreaded "old script wrapper" error message -RUN curl -sSL https://bootstrap.pypa.io/pip/3.6/get-pip.py -o get-pip.py && \ - python3 get-pip.py && \ - rm -f get-pip.py - - -# Install pip installable-stuff -RUN ${PIP_INSTALL} \ - "ocrd >= ${OCRD_VERSION_MINIMUM}" - - -# Check pip dependencies -RUN pip check - - -WORKDIR /data - -# Default command -CMD ['ocrd'] diff --git a/Dockerfile-core-cuda10.1 b/Dockerfile-core-cuda10.1 deleted file mode 100644 index 569a567..0000000 --- a/Dockerfile-core-cuda10.1 +++ /dev/null @@ -1,53 +0,0 @@ -FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 - -ARG PIP_INSTALL="pip install --no-cache-dir" -ARG OCRD_VERSION_MINIMUM="2.23.3" -ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 -ENV PIP_DEFAULT_TIMEOUT=120 - - -RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ - apt-get update && \ - apt-get install -y \ - curl xz-utils \ - build-essential python3-dev \ -# For get-pip.py: - python3-distutils \ -# For add-apt-repository: - software-properties-common \ -# XML utils - libxml2-utils \ - xmlstarlet \ -# OCR-D uses ImageMagick for pixel density estimation - imagemagick \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - - -# Set up OCR-D logging -RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py - - -# Install pip (and setuptools) -# We use get-pip.py here to avoid -# a. having to upgrade from Ubuntu's pip -# b. 
the dreaded "old script wrapper" error message -RUN curl -sSL https://bootstrap.pypa.io/pip/3.6/get-pip.py -o get-pip.py && \ - python3 get-pip.py && \ - rm -f get-pip.py - - -# Install pip installable-stuff -RUN ${PIP_INSTALL} \ - "ocrd >= ${OCRD_VERSION_MINIMUM}" - - -# Check pip dependencies -RUN pip check - - -WORKDIR /data - -# Default command -CMD ['ocrd'] diff --git a/Dockerfile-core-cuda12.1 b/Dockerfile-core-cuda12.1 new file mode 100644 index 0000000..c494a2c --- /dev/null +++ b/Dockerfile-core-cuda12.1 @@ -0,0 +1,70 @@ +FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04 + +ARG PIP_INSTALL="pip install --no-cache-dir" +ARG OCRD_VERSION_MINIMUM="2.47.0" +ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 +ENV PIP_DEFAULT_TIMEOUT=120 + + +RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ + apt-get update && \ + apt-get install -y \ + build-essential \ + curl \ + git \ + xz-utils \ + pkg-config \ +# For add-apt-repository: + software-properties-common \ +# XML utils + libxml2-utils \ + xmlstarlet \ +# OCR-D uses ImageMagick for pixel density estimation + imagemagick \ +# pyenv builds +# TODO: builder container? + libz-dev \ + libssl-dev \ + libbz2-dev \ + liblzma-dev \ + libncurses-dev \ + libffi-dev \ + libreadline-dev \ + libsqlite3-dev \ + libmagic-dev \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + +# Set up OCR-D logging +RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py + + +# Install pyenv +# TODO: do not run as root +# TODO: does just saying "3.7" work as intended? 
+ENV HOME=/root +ENV PYENV_ROOT=/usr/local/share/pyenv +ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH +RUN \ + git clone --depth=1 https://github.com/yyuu/pyenv.git $PYENV_ROOT && \ + pyenv install 3.7 && \ + pyenv global 3.7 && \ + pyenv rehash && \ + pip install -U pip wheel && \ + pip install setuptools + +# Install pip installable-stuff +RUN ${PIP_INSTALL} \ + "ocrd >= ${OCRD_VERSION_MINIMUM}" + + +# Check pip dependencies +RUN pip check + + +WORKDIR /data + +# Default command +CMD ['ocrd'] diff --git a/Dockerfile-dinglehopper b/Dockerfile-dinglehopper index d7c9bc1..765a1f2 100644 --- a/Dockerfile-dinglehopper +++ b/Dockerfile-dinglehopper @@ -1,14 +1,13 @@ -ARG DRONE_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core:$GIT_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" -ARG DINGLEHOPPER_COMMIT="dcc10c5" +ARG DINGLEHOPPER_VERSION="0.9.2" # Build pip installable stuff RUN ${PIP_INSTALL} \ -# Now the real stuff: - https://github.com/qurator-spk/dinglehopper/archive/$DINGLEHOPPER_COMMIT.tar.gz + "dinglehopper == $DINGLEHOPPER_VERSION" # Check pip dependencies diff --git a/Dockerfile-eynollah b/Dockerfile-eynollah index 1ce84d2..6505174 100644 --- a/Dockerfile-eynollah +++ b/Dockerfile-eynollah @@ -1,8 +1,8 @@ -ARG DRONE_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" -ARG EYNOLLAH_VERSION="0.0.10" +ARG EYNOLLAH_VERSION="0.3.0" # Build pip installable stuff @@ -10,10 +10,6 @@ RUN ${PIP_INSTALL} \ "eynollah == ${EYNOLLAH_VERSION}" -# Copy OCR models -COPY data/eynollah /var/lib/eynollah - - # Check pip dependencies RUN pip check diff --git a/Dockerfile-ocrd_anybaseocr b/Dockerfile-ocrd_anybaseocr index fb14dd7..6ce5d0e 100644 --- a/Dockerfile-ocrd_anybaseocr +++ b/Dockerfile-ocrd_anybaseocr @@ -1,5 +1,5 @@ -ARG DRONE_COMMIT="latest" -FROM 
quratorspk/ocrd-galley-core-cuda10.1:$DRONE_COMMIT +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" ARG OCRD_ANYBASEOCR_VERSION="1.8.2" diff --git a/Dockerfile-ocrd_calamari b/Dockerfile-ocrd_calamari index fa38414..3b9d9cc 100644 --- a/Dockerfile-ocrd_calamari +++ b/Dockerfile-ocrd_calamari @@ -1,5 +1,5 @@ -ARG DRONE_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda10.1:$DRONE_COMMIT +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT # XXX https://github.com/OCR-D/core/issues/642 @@ -12,14 +12,6 @@ RUN ${PIP_INSTALL} \ "ocrd_calamari == $OCRD_CALAMARI_VERSION" -# Copy OCR models -RUN mkdir -p /var/lib/calamari-models/GT4HistOCR -COPY data/calamari-models/GT4HistOCR/2019-12-11T11_10+0100 /var/lib/calamari-models/GT4HistOCR/2019-12-11T11_10+0100 -# XXX experimental -#COPY data/calamari-models/GT4HistOCR/2019-12-18T17_24+0100-with-augmentation-UNTESTED /var/lib/calamari-models/GT4HistOCR/2019-12-18T17_24+0100 -#COPY data/mirror/github.com/Calamari-OCR/calamari_models/gt4histocr /var/lib/calamari-models/GT4HistOCR-chreul - - # Check pip dependencies RUN pip check diff --git a/Dockerfile-ocrd_calamari03 b/Dockerfile-ocrd_calamari03 index 43389ad..5a8be3d 100644 --- a/Dockerfile-ocrd_calamari03 +++ b/Dockerfile-ocrd_calamari03 @@ -1,5 +1,5 @@ -ARG DRONE_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" @@ -13,12 +13,6 @@ RUN ${PIP_INSTALL} \ 'ocrd_calamari == 0.0.7' -# Copy OCR models -RUN mkdir -p /var/lib/calamari-models/GT4HistOCR -COPY data/calamari-models/GT4HistOCR/2019-07-22T15_49+0200 /var/lib/calamari-models/GT4HistOCR/2019-07-22T15_49+0200 - - - # Check pip dependencies RUN pip check diff --git a/Dockerfile-ocrd_cis b/Dockerfile-ocrd_cis index 5790d74..e967893 100644 --- a/Dockerfile-ocrd_cis +++ 
b/Dockerfile-ocrd_cis @@ -1,5 +1,5 @@ -ARG DRONE_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core:$GIT_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" ARG OCRD_CIS_VERSION="0.1.5" diff --git a/Dockerfile-ocrd_fileformat b/Dockerfile-ocrd_fileformat index 0275598..060f79c 100644 --- a/Dockerfile-ocrd_fileformat +++ b/Dockerfile-ocrd_fileformat @@ -1,5 +1,5 @@ -ARG DRONE_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core:$GIT_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" ARG OCRD_FILEFORMAT_VERSION="0.5.0" diff --git a/Dockerfile-ocrd_olena b/Dockerfile-ocrd_olena index b9f2dcc..7d74145 100644 --- a/Dockerfile-ocrd_olena +++ b/Dockerfile-ocrd_olena @@ -1,5 +1,5 @@ -ARG DRONE_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core:$GIT_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" ARG OCRD_OLENA_VERSION="1.3.0" @@ -26,6 +26,7 @@ RUN curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena cd ocrd_olena && \ sed -i 's/^install: deps$/install:/' Makefile && \ ${PIP_INSTALL} ocrd && \ + make deps-ubuntu && \ make install PREFIX=/usr/local && \ cd .. 
&& rm -rf ocrd_olena ocrd_olena.tar.gz diff --git a/Dockerfile-ocrd_segment b/Dockerfile-ocrd_segment index 2dfee9e..44699ff 100644 --- a/Dockerfile-ocrd_segment +++ b/Dockerfile-ocrd_segment @@ -1,5 +1,5 @@ -ARG DRONE_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core:$GIT_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" ARG OCRD_SEGMENT_VERSION="0.1.21" diff --git a/Dockerfile-ocrd_tesserocr b/Dockerfile-ocrd_tesserocr index 32b8e45..c046cfc 100644 --- a/Dockerfile-ocrd_tesserocr +++ b/Dockerfile-ocrd_tesserocr @@ -1,15 +1,17 @@ -ARG DRONE_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core:$GIT_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" ARG TESSDATA_BEST_VERSION="4.0.0" -ARG OCRD_TESSEROCR_VERSION="0.16.0" +ARG OCRD_TESSEROCR_VERSION="0.17.0" ENV TESSDATA_PREFIX /usr/local/share/tessdata # Install Leptonica and Tesseract. -RUN add-apt-repository ppa:alex-p/tesseract-ocr && \ - apt-get update && \ +# TODO: Review if alex-p's repo is still necessary on jammy (jammy has 4.1.1, +# alex-p has 4.1.3, but not for jammy.) 
+# RUN add-apt-repository ppa:alex-p/tesseract-ocr && \ +RUN apt-get update && \ apt-get install -y \ tesseract-ocr \ libtesseract-dev \ @@ -17,18 +19,10 @@ RUN add-apt-repository ppa:alex-p/tesseract-ocr && \ apt-get clean && rm -rf /var/lib/apt/lists/* -# Copy OCR models -RUN mkdir -p $TESSDATA_PREFIX -ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/ -COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/ -RUN curl -sSL -O https://ub-backup.bib.uni-mannheim.de/~stweil/ocrd-train/data/Fraktur_5000000/tessdata_fast/Fraktur_50000000.502_198857.traineddata && \ - mv *.traineddata $TESSDATA_PREFIX/ - # Build pip installable stuff RUN ${PIP_INSTALL} \ "ocrd_tesserocr == ${OCRD_TESSEROCR_VERSION}" - # Check pip dependencies RUN pip check diff --git a/Dockerfile-ocrd_trocr b/Dockerfile-ocrd_trocr new file mode 100644 index 0000000..fc05759 --- /dev/null +++ b/Dockerfile-ocrd_trocr @@ -0,0 +1,18 @@ +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core:$GIT_COMMIT + +ARG PIP_INSTALL="pip install --no-cache-dir" +ARG OCRD_TROCR_COMMIT="250ff1c" + + +# Build pip installable stuff +RUN ${PIP_INSTALL} \ + https://github.com/qurator-spk/ocrd_trocr/archive/$OCRD_TROCR_COMMIT.tar.gz + + +# Check pip dependencies +RUN pip check + + +# Default command +CMD ["ocrd-trocr-recognize"] diff --git a/Dockerfile-ocrd_wrap b/Dockerfile-ocrd_wrap index d38bc2c..518d306 100644 --- a/Dockerfile-ocrd_wrap +++ b/Dockerfile-ocrd_wrap @@ -1,5 +1,5 @@ -ARG DRONE_COMMIT="latest" -FROM quratorspk/ocrd-galley-core:$DRONE_COMMIT +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core:$GIT_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" ARG OCRD_WRAP_VERSION="0.1.7" diff --git a/Dockerfile-sbb_binarization b/Dockerfile-sbb_binarization index ff298ca..103b8de 100644 --- a/Dockerfile-sbb_binarization +++ b/Dockerfile-sbb_binarization @@ -1,5 +1,5 @@ -ARG DRONE_COMMIT="latest" -FROM 
quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" ARG SBB_BINARIZATION_VERSION="0.0.10" @@ -11,10 +11,6 @@ RUN ${PIP_INSTALL} \ "sbb_binarization == $SBB_BINARIZATION_VERSION" -# Copy models -COPY data/sbb_binarization/2021-03-09 /var/lib/sbb_binarization - - # Check pip dependencies RUN pip check diff --git a/Dockerfile-sbb_textline_detector b/Dockerfile-sbb_textline_detector index d206131..0569ab8 100644 --- a/Dockerfile-sbb_textline_detector +++ b/Dockerfile-sbb_textline_detector @@ -1,5 +1,5 @@ -ARG DRONE_COMMIT="latest" -FROM quratorspk/ocrd-galley-core-cuda10.0:$DRONE_COMMIT +ARG GIT_COMMIT="latest" +FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT ARG PIP_INSTALL="pip install --no-cache-dir" ARG SBB_TEXTLINE_DETECTOR_COMMIT="c4df3d6" @@ -12,10 +12,6 @@ RUN ${PIP_INSTALL} \ https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz -# Copy OCR models -COPY data/textline_detection /var/lib/textline_detection - - # Check pip dependencies RUN pip check diff --git a/README-DEV.md b/README-DEV.md index fc78890..6fa07e4 100644 --- a/README-DEV.md +++ b/README-DEV.md @@ -3,7 +3,6 @@ How to add a processor * Add model download to `build` (if necessary) * Add a Dockerfile * Add commands to `wrapper/qurator/ocrd_galley/cli.py` -* Add to `.drone.star` Releasing --------- @@ -27,3 +26,12 @@ issue should be open that reminds us to go back to a versioned release again. Other than relying on "proper releases", this also has a second purpose: Review releases of qurator-spk releases. + + +Test builds +----------- +XXX Review this +``` +GIT_COMMIT=test ./build Dockerfile-core Dockerfile-ocrd_tesserocr +DOCKER_IMAGE_TAG=test ./test-ocrd_tesserocr.sh +``` diff --git a/README.md b/README.md index 65e6e3e..e39581b 100644 --- a/README.md +++ b/README.md @@ -29,14 +29,9 @@ including all dependencies in Docker. 
How to use ---------- -**Currently, due to problems with the Travis CI, we do not provide pre-built -containers anymore.*** - -To build the containers yourself using Docker: -~~~ -cd ~/devel/ocrd-galley/ -./build -~~~ +ocrd-galley uses Docker to run the OCR-D images. We provide pre-built container +images that get downloaded automatically when you run the provided wrappers for +the OCR-D processors. You can then install the wrappers into a Python venv: ~~~ @@ -44,9 +39,13 @@ cd ~/devel/ocrd-galley/wrapper pip install . ~~~ +To download models, you need to use the `-a` flag of `ocrd resmgr`: +~~~ +ocrd resmgr download -a ocrd-calamari-recognize default +~~~ + You may then use the script `my_ocrd_workflow` to use your self-built containers on an example workspace: - ~~~ # Download an example workspace cd /tmp @@ -110,3 +109,11 @@ cd workspace-xxxxx # output by the last command ~~~ This produces a workspace from the files and then runs the OCR workflow on it. + +Build the containers yourself +----------------------------- +To build the containers yourself using Docker: +~~~ +cd ~/devel/ocrd-galley/ +./build +~~~ diff --git a/build b/build index 3c713e5..96835cb 100755 --- a/build +++ b/build @@ -22,38 +22,10 @@ echo "$sub_images" echo - -DATA_SUBDIR=data -get_from_annex() { - annex_get 'calamari-models/GT4HistOCR/2019-07-22T15_49+0200/*.ckpt*' - annex_get 'calamari-models/GT4HistOCR/2019-12-11T11_10+0100/*.ckpt*' - annex_get 'calamari-models/GT4HistOCR/2019-12-18T17_24+0100*/*.ckpt*' - annex_get 'mirror/github.com/Calamari-OCR/calamari_models/gt4histocr/*.ckpt*' - annex_get 'tesseract-models/GT4HistOCR/*.traineddata' - annex_get 'textline_detection/*.h5' - annex_get 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' - annex_get 'sbb_binarization/2021-03-09/*.h5' - annex_get 'eynollah/*.h5' -} -get_from_web() { - download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz' 'calamari-models/GT4HistOCR/2019-07-22T15_49+0200' - 
download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz' 'calamari-models/GT4HistOCR/2019-12-11T11_10+0100' - download_to 'https://qurator-data.de/tesseract-models/GT4HistOCR/models.tar' 'tesseract-models/GT4HistOCR' - download_to 'https://qurator-data.de/sbb_textline_detector/models.tar.gz' 'textline_detection' - download_to --strip-components 1 'https://qurator-data.de/sbb_binarization/2021-03-09/models.tar.gz' 'sbb_binarization/2021-03-09' - download_to --no-unpack 'https://qurator-data.de/mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' - download_to --strip-components 1 'https://qurator-data.de/eynollah/models_eynollah.tar.gz' 'eynollah' -} -. $self_dir/qurator_data_lib.sh -handle_data - - - # Update base images if we build a core image if echo "$sub_images" | grep -q core; then - docker pull ubuntu:18.04 - docker pull nvidia/cuda:10.0-cudnn7-runtime-ubuntu18.04 - docker pull nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 + docker pull ubuntu:22.04 + docker pull nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04 fi for sub_image in $sub_images; do diff --git a/build-tmp-XXX b/build-tmp-XXX deleted file mode 100755 index 0072aa3..0000000 --- a/build-tmp-XXX +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -x -set -e - -self=`realpath $0` -self_dir=`dirname "$self"` - -DATA_SUBDIR=data -get_from_web() { - download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz' 'calamari-models/GT4HistOCR/2019-07-22T15_49+0200' - download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz' 'calamari-models/GT4HistOCR/2019-12-11T11_10+0100' - download_to 'https://qurator-data.de/tesseract-models/GT4HistOCR/models.tar' 'tesseract-models/GT4HistOCR' - download_to 'https://qurator-data.de/sbb_textline_detector/models.tar.gz' 'textline_detection' - download_to --strip-components 1 
'https://qurator-data.de/sbb_binarization/2021-03-09/models.tar.gz' 'sbb_binarization/2021-03-09' - download_to --no-unpack 'https://qurator-data.de/mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' - download_to --strip-components 1 'https://qurator-data.de/eynollah/models_eynollah.tar.gz' 'eynollah' -} -. $self_dir/qurator_data_lib.sh -handle_data diff --git a/data b/data deleted file mode 160000 index 9ab08a3..0000000 --- a/data +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9ab08a3626dde1d38dd622b65e425277cd029722 diff --git a/test-core.sh b/test-core.sh new file mode 100755 index 0000000..04467cd --- /dev/null +++ b/test-core.sh @@ -0,0 +1,14 @@ +#!/bin/sh +set -ex + +test_id=`basename $0` +cd `mktemp -d /tmp/$test_id-XXXXX` + +# Prepare test workspace +wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip +unzip actevedef_718448162.first-page+binarization+segmentation.zip +cd actevedef_718448162.first-page+binarization+segmentation + +# Run tests +ocrd workspace validate \ + --page-coordinate-consistency off diff --git a/test-dinglehopper.sh b/test-dinglehopper.sh new file mode 100755 index 0000000..6cdcdf9 --- /dev/null +++ b/test-dinglehopper.sh @@ -0,0 +1,13 @@ +#!/bin/sh +set -ex + +test_id=`basename $0` +cd `mktemp -d /tmp/$test_id-XXXXX` + +# Prepare test workspace +wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip +unzip actevedef_718448162.first-page+binarization+segmentation.zip +cd actevedef_718448162.first-page+binarization+segmentation + +# Run tests +ocrd-dinglehopper -I OCR-D-GT-PAGE,OCR-D-SEG-LINE-SBB -O DINGLEHOPPER-TEST diff --git a/test-ocrd_olena.sh b/test-ocrd_olena.sh new file mode 100755 index 0000000..73c940a --- /dev/null +++ b/test-ocrd_olena.sh @@ -0,0 +1,12 @@ +#!/bin/sh +set -ex + +cd `mktemp -d /tmp/test-ocrd_olena-XXXXX` + +# Prepare 
test workspace +wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip +unzip actevedef_718448162.first-page+binarization+segmentation.zip +cd actevedef_718448162.first-page+binarization+segmentation + +# Run tests +ocrd-olena-binarize -I OCR-D-IMG -O TEST-OLENA diff --git a/test-ocrd_tesserocr.sh b/test-ocrd_tesserocr.sh new file mode 100755 index 0000000..01dc89b --- /dev/null +++ b/test-ocrd_tesserocr.sh @@ -0,0 +1,17 @@ +#!/bin/sh +set -ex + +cd `mktemp -d /tmp/test-ocrd_tesserocr-XXXXX` + +# Prepare processors +ocrd resmgr download ocrd-tesserocr-recognize Fraktur_GT4HistOCR.traineddata + +# Prepare test workspace +wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip +unzip actevedef_718448162.first-page+binarization+segmentation.zip +cd actevedef_718448162.first-page+binarization+segmentation + +# Run tests +ocrd-tesserocr-segment-region -I OCR-D-IMG-BIN -O TEST-TESS-SEG-REG +ocrd-tesserocr-segment-line -I TEST-TESS-SEG-REG -O TEST-TESS-SEG-LINE +ocrd-tesserocr-recognize -I TEST-TESS-SEG-LINE -O TEST-TESS-OCR -P model Fraktur_GT4HistOCR diff --git a/test-ocrd_trocr.sh b/test-ocrd_trocr.sh new file mode 100755 index 0000000..2c19376 --- /dev/null +++ b/test-ocrd_trocr.sh @@ -0,0 +1,14 @@ +#!/bin/sh +set -ex + +cd `mktemp -d /tmp/test-ocrd_trocr-XXXXX` + +# Prepare processors + +# Prepare test workspace +wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip +unzip actevedef_718448162.first-page+binarization+segmentation.zip +cd actevedef_718448162.first-page+binarization+segmentation + +# Run tests +ocrd-trocr-recognize -I OCR-D-SEG-LINE-SBB -O TEST-TROCR diff --git a/wrapper/qurator/ocrd_galley/__init__.py b/wrapper/qurator/ocrd_galley/__init__.py index a5bd848..e69de29 100644 --- a/wrapper/qurator/ocrd_galley/__init__.py +++ b/wrapper/qurator/ocrd_galley/__init__.py @@ -1 +0,0 @@ -from .cli import * diff --git 
a/wrapper/qurator/ocrd_galley/cli.py b/wrapper/qurator/ocrd_galley/cli.py index d6e6ab8..9423c61 100644 --- a/wrapper/qurator/ocrd_galley/cli.py +++ b/wrapper/qurator/ocrd_galley/cli.py @@ -1,61 +1,48 @@ import os import subprocess import sys +import colorama +from pathlib import Path +from termcolor import colored +from .sub_images import sub_images DOCKER_IMAGE_PREFIX = os.environ.get("DOCKER_IMAGE_PREFIX", "quratorspk/ocrd-galley") DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "latest") LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") +# xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler +# to just roll it on our own. Values are wrapped in Path() because os.environ yields plain str; unset or empty variables fall back to the XDG defaults. +XDG_CONFIG_HOME = Path(os.environ.get("XDG_CONFIG_HOME") or Path.home() / ".config") +XDG_DATA_HOME = Path(os.environ.get("XDG_DATA_HOME") or Path.home() / ".local" / "share") +XDG_CACHE_HOME = Path(os.environ.get("XDG_CACHE_HOME") or Path.home() / ".cache") -sub_images = { - "ocrd": "core", - "ocrd-olena-binarize": "ocrd_olena", - "ocrd-sbb-binarize": "sbb_binarization", - "ocrd-sbb-textline-detector": "sbb_textline_detector", - "ocrd-calamari-recognize": "ocrd_calamari", - "ocrd-calamari-recognize03": "ocrd_calamari03", - "ocrd-tesserocr-segment-region": "ocrd_tesserocr", - "ocrd-tesserocr-segment-line": "ocrd_tesserocr", - "ocrd-tesserocr-recognize": "ocrd_tesserocr", - "ocrd-dinglehopper": "dinglehopper", - "ocrd-cis-ocropy-clip": "ocrd_cis", - "ocrd-cis-ocropy-resegment": "ocrd_cis", - "ocrd-cis-ocropy-segment": "ocrd_cis", - "ocrd-cis-ocropy-deskew": "ocrd_cis", - "ocrd-cis-ocropy-denoise": "ocrd_cis", - "ocrd-cis-ocropy-binarize": "ocrd_cis", - "ocrd-cis-ocropy-dewarp": "ocrd_cis", - "ocrd-cis-ocropy-recognize": "ocrd_cis", - "ocrd-fileformat-transform": "ocrd_fileformat", - "ocrd-segment-extract-pages": "ocrd_segment", - "ocrd-segment-extract-regions": "ocrd_segment", - "ocrd-segment-extract-lines": "ocrd_segment", - "ocrd-segment-from-masks": "ocrd_segment", - "ocrd-segment-from-coco": "ocrd_segment", - "ocrd-segment-repair":
"ocrd_segment", - "ocrd-segment-evaluate": "ocrd_segment", - "ocrd-preprocess-image": "ocrd_wrap", - "ocrd-skimage-normalize": "ocrd_wrap", - "ocrd-skimage-denoise-raw": "ocrd_wrap", - "ocrd-skimage-binarize": "ocrd_wrap", - "ocrd-skimage-denoise": "ocrd_wrap", - "ocrd-eynollah-segment": "eynollah", - "ocrd-anybaseocr-crop": "ocrd_anybaseocr", - "ocrd-anybaseocr-deskew": "ocrd_anybaseocr", - - # non OCR-D CLI - "ocr-transform": "ocrd_fileformat", -} - +# ocrd_tesserocr +TESSDATA_PREFIX = XDG_DATA_HOME / "ocrd-resources" / "ocrd-tesserocr-recognize" def main(): + colorama.init() + argv = sys.argv.copy() argv[0] = os.path.basename(argv[0]) - sub_image = sub_images[argv[0]] + # If we're running ocrd resmgr download we need to run the correct subimage. + if argv[:3] == ["ocrd", "resmgr", "download"] or \ + argv[:3] == ["ocrd", "resmgr", "list-available"]: + # Default to the base image + sub_image = sub_images[argv[0]] + # But look for a match of the executable + for x in argv[3:]: + if x in sub_images: + sub_image = sub_images[x] + break + else: + sub_image = sub_images[argv[0]] + docker_image = "%s-%s:%s" % (DOCKER_IMAGE_PREFIX, sub_image, DOCKER_IMAGE_TAG) + if DOCKER_IMAGE_TAG != "latest": + print(colored(f"Using {docker_image}", 'red')) docker_run(argv, docker_image) @@ -67,6 +54,29 @@ def docker_run(argv, docker_image): docker_run_options.extend(["-e", "LOG_LEVEL=%s" % LOG_LEVEL]) docker_run_options.extend(["-e", "_OCRD_COMPLETE"]) + # home directory + docker_run_options.extend(["-e", "HOME=%s" % Path.home()]) + + # .config + docker_run_options.extend(["-e", "XDG_CONFIG_HOME=%s" % XDG_CONFIG_HOME]) + docker_run_options.extend(["--mount", "type=bind,src=%s,target=%s" % + (XDG_CONFIG_HOME, XDG_CONFIG_HOME)]) + # .local/share + docker_run_options.extend(["-e", "XDG_DATA_HOME=%s" % XDG_DATA_HOME]) + docker_run_options.extend(["--mount", "type=bind,src=%s,target=%s" % + (XDG_DATA_HOME, XDG_DATA_HOME)]) + # .cache + docker_run_options.extend(["-e", "XDG_CACHE_HOME=%s" % 
XDG_CACHE_HOME]) + docker_run_options.extend(["--mount", "type=bind,src=%s,target=%s" % + (XDG_CACHE_HOME, XDG_CACHE_HOME)]) + # .huggingface + os.makedirs(Path.home() / ".huggingface", exist_ok=True) + docker_run_options.extend(["--mount", "type=bind,src=%s,target=%s" % + (Path.home() / ".huggingface", Path("/root") / ".huggingface")]) + + # ocrd_tesserocr + docker_run_options.extend(["-e", "TESSDATA_PREFIX=%s" % TESSDATA_PREFIX]) + # JAVA_TOOL_OPTIONS is used for Java proxy settings if os.environ.get("JAVA_TOOL_OPTIONS"): docker_run_options.extend(["-e", "JAVA_TOOL_OPTIONS"]) diff --git a/wrapper/qurator/ocrd_galley/sub_images.py b/wrapper/qurator/ocrd_galley/sub_images.py new file mode 100644 index 0000000..aaea945 --- /dev/null +++ b/wrapper/qurator/ocrd_galley/sub_images.py @@ -0,0 +1,40 @@ +sub_images = { + "ocrd": "core", + "ocrd-olena-binarize": "ocrd_olena", + "ocrd-sbb-binarize": "sbb_binarization", + "ocrd-sbb-textline-detector": "sbb_textline_detector", + "ocrd-calamari-recognize": "ocrd_calamari", + "ocrd-calamari-recognize03": "ocrd_calamari03", + "ocrd-tesserocr-segment-region": "ocrd_tesserocr", + "ocrd-tesserocr-segment-line": "ocrd_tesserocr", + "ocrd-tesserocr-recognize": "ocrd_tesserocr", + "ocrd-dinglehopper": "dinglehopper", + "ocrd-cis-ocropy-clip": "ocrd_cis", + "ocrd-cis-ocropy-resegment": "ocrd_cis", + "ocrd-cis-ocropy-segment": "ocrd_cis", + "ocrd-cis-ocropy-deskew": "ocrd_cis", + "ocrd-cis-ocropy-denoise": "ocrd_cis", + "ocrd-cis-ocropy-binarize": "ocrd_cis", + "ocrd-cis-ocropy-dewarp": "ocrd_cis", + "ocrd-cis-ocropy-recognize": "ocrd_cis", + "ocrd-fileformat-transform": "ocrd_fileformat", + "ocrd-segment-extract-pages": "ocrd_segment", + "ocrd-segment-extract-regions": "ocrd_segment", + "ocrd-segment-extract-lines": "ocrd_segment", + "ocrd-segment-from-masks": "ocrd_segment", + "ocrd-segment-from-coco": "ocrd_segment", + "ocrd-segment-repair": "ocrd_segment", + "ocrd-segment-evaluate": "ocrd_segment", + "ocrd-preprocess-image": 
"ocrd_wrap", + "ocrd-skimage-normalize": "ocrd_wrap", + "ocrd-skimage-denoise-raw": "ocrd_wrap", + "ocrd-skimage-binarize": "ocrd_wrap", + "ocrd-skimage-denoise": "ocrd_wrap", + "ocrd-eynollah-segment": "eynollah", + "ocrd-anybaseocr-crop": "ocrd_anybaseocr", + "ocrd-anybaseocr-deskew": "ocrd_anybaseocr", + "ocrd-trocr-recognize": "ocrd_trocr", + + # non OCR-D CLI + "ocr-transform": "ocrd_fileformat", +} diff --git a/wrapper/requirements.txt b/wrapper/requirements.txt new file mode 100644 index 0000000..a994db4 --- /dev/null +++ b/wrapper/requirements.txt @@ -0,0 +1,2 @@ +colorama +termcolor diff --git a/wrapper/setup.py b/wrapper/setup.py index facee4c..3ec8cb9 100644 --- a/wrapper/setup.py +++ b/wrapper/setup.py @@ -1,9 +1,12 @@ from io import open from setuptools import find_packages, setup -from qurator.ocrd_galley.cli import sub_images +from qurator.ocrd_galley.sub_images import sub_images console_scripts = ["%s=qurator.ocrd_galley.cli:main" % command for command in sub_images.keys()] +with open("requirements.txt") as fp: + install_requires = fp.read() + setup( name="ocrd-galley", author="Mike Gerber, The QURATOR SPK Team", @@ -13,6 +16,7 @@ setup( license="Apache", packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), namespace_packages=["qurator"], + install_requires=install_requires, entry_points={ "console_scripts": console_scripts, },