🐛 Fix ocrd_trocr (by adding custom image)

1 year ago · b01d2ca6a1
parent 956de7492f
commit b01d2ca6a1
7 changed files with 74 additions and 52 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1 @@
 .git
--- a/Dockerfile-ocrd_trocr
+++ b/Dockerfile-ocrd_trocr
@ -1,8 +1,7 @@
-ARG GIT_COMMIT="latest"
+FROM ocrd/all:maximum
 FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
-ARG PIP_INSTALL="pip install --no-cache-dir"
+ARG PIP_INSTALL="pip3 install --no-cache-dir"
-ARG OCRD_TROCR_COMMIT="250ff1c"
+ARG OCRD_TROCR_COMMIT="30696cb"
 # Build pip installable stuff
--- a/build.sh
+++ b/build.sh
@ -0,0 +1,4 @@
 #!/bin/sh
 # Build the custom ocrd_trocr Docker image from Dockerfile-ocrd_trocr,
 # using the current directory as the build context.
 # -e: abort on the first failing command; -x: echo each command as it runs.
 set -ex
 # The image is tagged ocrd_trocr:latest.
 docker build . -t ocrd_trocr:latest -f Dockerfile-ocrd_trocr
--- a/test-ocrd_trocr.sh
+++ b/test-ocrd_trocr.sh
@ -13,3 +13,5 @@ cd actevedef_718448162.first-page+binarization+segmentation
 # Run tests
 ocrd-trocr-recognize -I OCR-D-SEG-LINE-SBB -O TEST-TROCR
 # TODO Does not use a useful model, does not check that text was recognized
--- a/wrapper/qurator/ocrd_galley/cli.py
+++ b/wrapper/qurator/ocrd_galley/cli.py
@ -5,8 +5,9 @@ import colorama
 from pathlib import Path
 from termcolor import colored
 from .processor_images import processor_images
 DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "maximum")  # TODO rename
 LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
 # xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler
@ -24,9 +25,23 @@ def main():
    argv = sys.argv.copy()
    argv[0] = os.path.basename(argv[0])
    docker_image = "ocrd/all:%s" % (DOCKER_IMAGE_TAG, )
-    if DOCKER_IMAGE_TAG != "maximum":
+    # If we're running ocrd resmgr download we need to run the correct subimage.
    if argv[:3] == ["ocrd", "resmgr", "download"] or \
       argv[:3] == ["ocrd", "resmgr", "list-available"]:
        # Default to the base image
        processor_image = processor_images[argv[0]]
        # But look for a match of the executable
        for x in argv[3:]:
            if x in processor_images:
                processor_image = processor_images[x]
                break
    else:
        processor_image = processor_images[argv[0]]
    docker_image = processor_image
    if docker_image != "ocrd/all:maximum":
        print(colored(f"Using {docker_image}", 'red'))
    docker_run(argv, docker_image)
--- a/wrapper/qurator/ocrd_galley/processor_images.py
+++ b/wrapper/qurator/ocrd_galley/processor_images.py
@ -0,0 +1,45 @@
 # Maps the name of each wrapped command-line executable to the Docker image
 # it is run in. The wrapper CLI looks entries up by executable name
 # (processor_images[argv[0]]) and passes the value on as the image to run.
 processor_images = {
        # OCR-D processors that all run in the stock ocrd/all:maximum image
        "ocrd": "ocrd/all:maximum",
        "ocrd-olena-binarize": "ocrd/all:maximum",
        "ocrd-sbb-binarize": "ocrd/all:maximum",
        "ocrd-sbb-textline-detector": "ocrd/all:maximum",
        "ocrd-calamari-recognize": "ocrd/all:maximum",
        "ocrd-calamari-recognize03": "ocrd/all:maximum",
        "ocrd-tesserocr-segment-region": "ocrd/all:maximum",
        "ocrd-tesserocr-segment-line": "ocrd/all:maximum",
        "ocrd-tesserocr-recognize": "ocrd/all:maximum",
        "ocrd-dinglehopper": "ocrd/all:maximum",
        "ocrd-cis-ocropy-clip": "ocrd/all:maximum",
        "ocrd-cis-ocropy-resegment": "ocrd/all:maximum",
        "ocrd-cis-ocropy-segment": "ocrd/all:maximum",
        "ocrd-cis-ocropy-deskew": "ocrd/all:maximum",
        "ocrd-cis-ocropy-denoise": "ocrd/all:maximum",
        "ocrd-cis-ocropy-binarize": "ocrd/all:maximum",
        "ocrd-cis-ocropy-dewarp": "ocrd/all:maximum",
        "ocrd-cis-ocropy-recognize": "ocrd/all:maximum",
        "ocrd-fileformat-transform": "ocrd/all:maximum",
        "ocrd-segment-extract-pages": "ocrd/all:maximum",
        "ocrd-segment-extract-regions": "ocrd/all:maximum",
        "ocrd-segment-extract-lines": "ocrd/all:maximum",
        "ocrd-segment-from-masks": "ocrd/all:maximum",
        "ocrd-segment-from-coco": "ocrd/all:maximum",
        "ocrd-segment-repair": "ocrd/all:maximum",
        "ocrd-segment-evaluate": "ocrd/all:maximum",
        "ocrd-preprocess-image": "ocrd/all:maximum",
        "ocrd-skimage-normalize": "ocrd/all:maximum",
        "ocrd-skimage-denoise-raw": "ocrd/all:maximum",
        "ocrd-skimage-binarize": "ocrd/all:maximum",
        "ocrd-skimage-denoise": "ocrd/all:maximum",
        "ocrd-eynollah-segment": "ocrd/all:maximum",
        "ocrd-anybaseocr-binarize": "ocrd/all:maximum",
        "ocrd-anybaseocr-crop": "ocrd/all:maximum",
        "ocrd-anybaseocr-deskew": "ocrd/all:maximum",
        # non OCR-D CLI
        "ocr-transform": "ocrd/all:maximum",
        "dinglehopper": "ocrd/all:maximum",
        "dinglehopper-extract": "ocrd/all:maximum",
        # specialized images (ocrd_trocr is the custom image built by build.sh)
        "ocrd-trocr-recognize": "ocrd_trocr",
 }
--- a/wrapper/qurator/ocrd_galley/sub_images.py
+++ b/wrapper/qurator/ocrd_galley/sub_images.py
@ -1,44 +0,0 @@
 # Maps the name of each wrapped command-line executable to a sub-image name.
 # NOTE(review): how the sub-image name is turned into a full Docker image
 # reference is not visible here; confirm against the wrapper CLI.
 # TODO is a list now, basically (no more sub images)
 sub_images = {
        "ocrd": "core",
        "ocrd-olena-binarize": "ocrd_olena",
        "ocrd-sbb-binarize": "sbb_binarization",
        "ocrd-sbb-textline-detector": "sbb_textline_detector",
        "ocrd-calamari-recognize": "ocrd_calamari",
        "ocrd-calamari-recognize03": "ocrd_calamari03",
        "ocrd-tesserocr-segment-region": "ocrd_tesserocr",
        "ocrd-tesserocr-segment-line": "ocrd_tesserocr",
        "ocrd-tesserocr-recognize": "ocrd_tesserocr",
        "ocrd-dinglehopper": "dinglehopper",
        "ocrd-cis-ocropy-clip": "ocrd_cis",
        "ocrd-cis-ocropy-resegment": "ocrd_cis",
        "ocrd-cis-ocropy-segment": "ocrd_cis",
        "ocrd-cis-ocropy-deskew": "ocrd_cis",
        "ocrd-cis-ocropy-denoise": "ocrd_cis",
        "ocrd-cis-ocropy-binarize": "ocrd_cis",
        "ocrd-cis-ocropy-dewarp": "ocrd_cis",
        "ocrd-cis-ocropy-recognize": "ocrd_cis",
        "ocrd-fileformat-transform": "ocrd_fileformat",
        "ocrd-segment-extract-pages": "ocrd_segment",
        "ocrd-segment-extract-regions": "ocrd_segment",
        "ocrd-segment-extract-lines": "ocrd_segment",
        "ocrd-segment-from-masks": "ocrd_segment",
        "ocrd-segment-from-coco": "ocrd_segment",
        "ocrd-segment-repair": "ocrd_segment",
        "ocrd-segment-evaluate": "ocrd_segment",
        "ocrd-preprocess-image": "ocrd_wrap",
        "ocrd-skimage-normalize": "ocrd_wrap",
        "ocrd-skimage-denoise-raw": "ocrd_wrap",
        "ocrd-skimage-binarize": "ocrd_wrap",
        "ocrd-skimage-denoise": "ocrd_wrap",
        "ocrd-eynollah-segment": "eynollah",
        "ocrd-anybaseocr-binarize": "ocrd_anybaseocr",
        "ocrd-anybaseocr-crop": "ocrd_anybaseocr",
        "ocrd-anybaseocr-deskew": "ocrd_anybaseocr",
        "ocrd-trocr-recognize": "ocrd_trocr",
        # non OCR-D CLI
        "ocr-transform": "ocrd_fileformat",
        # NOTE(review): the two "XXX now ocrd_all" values below look like
        # placeholders rather than real sub-image names; confirm.
        "dinglehopper": "XXX now ocrd_all",
        "dinglehopper-extract": "XXX now ocrd_all",
 }