Build the ocrd_trocr image on top of ocrd/all:maximum and let the wrapper
pick the Docker image per executable from a processor_images table.

Reconstructed from a whitespace-collapsed copy of this patch: line breaks,
indentation and blank context lines are inferred from the hunk headers, so
check them against the target tree before applying. The "index" blob ids of
test-ocrd_trocr.sh and wrapper/qurator/ocrd_galley/cli.py predate the review
edits (TODO wording, image lookup in main()).

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..6b8710a
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+.git
diff --git a/Dockerfile-ocrd_trocr b/Dockerfile-ocrd_trocr
index fc05759..ef4fe15 100644
--- a/Dockerfile-ocrd_trocr
+++ b/Dockerfile-ocrd_trocr
@@ -1,8 +1,7 @@
-ARG GIT_COMMIT="latest"
-FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
+FROM ocrd/all:maximum
 
-ARG PIP_INSTALL="pip install --no-cache-dir"
-ARG OCRD_TROCR_COMMIT="250ff1c"
+ARG PIP_INSTALL="pip3 install --no-cache-dir"
+ARG OCRD_TROCR_COMMIT="30696cb"
 
 
 # Build pip installable stuff
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..977e460
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+set -ex
+
+docker build . -t ocrd_trocr:latest -f Dockerfile-ocrd_trocr
diff --git a/test-ocrd_trocr.sh b/test-ocrd_trocr.sh
index d04be15..68295a9 100755
--- a/test-ocrd_trocr.sh
+++ b/test-ocrd_trocr.sh
@@ -12,4 +12,6 @@ unzip actevedef_718448162.first-page+binarization+segmentation.zip
 cd actevedef_718448162.first-page+binarization+segmentation
 
 # Run tests
-ocrd-trocr-recognize -I OCR-D-SEG-LINE-SBB -O TEST-TROCR
+ocrd-trocr-recognize -I OCR-D-SEG-LINE-SBB -O TEST-TROCR
+
+# TODO Does not use a useful model, does not check that text was recognized
diff --git a/wrapper/qurator/ocrd_galley/cli.py b/wrapper/qurator/ocrd_galley/cli.py
index 0263f42..a024a60 100644
--- a/wrapper/qurator/ocrd_galley/cli.py
+++ b/wrapper/qurator/ocrd_galley/cli.py
@@ -5,8 +5,9 @@ import colorama
 from pathlib import Path
 from termcolor import colored
 
+from .processor_images import processor_images
+
 
-DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "maximum") # TODO rename
 LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
 
 # xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler
@@ -24,9 +25,23 @@ def main():
     argv = sys.argv.copy()
     argv[0] = os.path.basename(argv[0])
 
-    docker_image = "ocrd/all:%s" % (DOCKER_IMAGE_TAG, )
-    if DOCKER_IMAGE_TAG != "maximum":
+    if argv[0] not in processor_images:
+        sys.exit(f"No Docker image configured for {argv[0]}")
+
+    # Default to the image of the executable we were called as.
+    docker_image = processor_images[argv[0]]
+
+    # "ocrd resmgr download"/"list-available" must run in the image of the
+    # processor named on the command line, so use the first argument we know.
+    if argv[:3] in (["ocrd", "resmgr", "download"],
+                    ["ocrd", "resmgr", "list-available"]):
+        for arg in argv[3:]:
+            if arg in processor_images:
+                docker_image = processor_images[arg]
+                break
+
+    if docker_image != "ocrd/all:maximum":
         print(colored(f"Using {docker_image}", 'red'))
     docker_run(argv, docker_image)
 
 
diff --git a/wrapper/qurator/ocrd_galley/processor_images.py b/wrapper/qurator/ocrd_galley/processor_images.py
new file mode 100644
index 0000000..da11bd0
--- /dev/null
+++ b/wrapper/qurator/ocrd_galley/processor_images.py
@@ -0,0 +1,45 @@
+processor_images = {
+    "ocrd": "ocrd/all:maximum",
+    "ocrd-olena-binarize": "ocrd/all:maximum",
+    "ocrd-sbb-binarize": "ocrd/all:maximum",
+    "ocrd-sbb-textline-detector": "ocrd/all:maximum",
+    "ocrd-calamari-recognize": "ocrd/all:maximum",
+    "ocrd-calamari-recognize03": "ocrd/all:maximum",
+    "ocrd-tesserocr-segment-region": "ocrd/all:maximum",
+    "ocrd-tesserocr-segment-line": "ocrd/all:maximum",
+    "ocrd-tesserocr-recognize": "ocrd/all:maximum",
+    "ocrd-dinglehopper": "ocrd/all:maximum",
+    "ocrd-cis-ocropy-clip": "ocrd/all:maximum",
+    "ocrd-cis-ocropy-resegment": "ocrd/all:maximum",
+    "ocrd-cis-ocropy-segment": "ocrd/all:maximum",
+    "ocrd-cis-ocropy-deskew": "ocrd/all:maximum",
+    "ocrd-cis-ocropy-denoise": "ocrd/all:maximum",
+    "ocrd-cis-ocropy-binarize": "ocrd/all:maximum",
+    "ocrd-cis-ocropy-dewarp": "ocrd/all:maximum",
+    "ocrd-cis-ocropy-recognize": "ocrd/all:maximum",
+    "ocrd-fileformat-transform": "ocrd/all:maximum",
+    "ocrd-segment-extract-pages": "ocrd/all:maximum",
+    "ocrd-segment-extract-regions": "ocrd/all:maximum",
+    "ocrd-segment-extract-lines": "ocrd/all:maximum",
+    "ocrd-segment-from-masks": "ocrd/all:maximum",
+    "ocrd-segment-from-coco": "ocrd/all:maximum",
+    "ocrd-segment-repair": "ocrd/all:maximum",
+    "ocrd-segment-evaluate": "ocrd/all:maximum",
+    "ocrd-preprocess-image": "ocrd/all:maximum",
+    "ocrd-skimage-normalize": "ocrd/all:maximum",
+    "ocrd-skimage-denoise-raw": "ocrd/all:maximum",
+    "ocrd-skimage-binarize": "ocrd/all:maximum",
+    "ocrd-skimage-denoise": "ocrd/all:maximum",
+    "ocrd-eynollah-segment": "ocrd/all:maximum",
+    "ocrd-anybaseocr-binarize": "ocrd/all:maximum",
+    "ocrd-anybaseocr-crop": "ocrd/all:maximum",
+    "ocrd-anybaseocr-deskew": "ocrd/all:maximum",
+
+    # non OCR-D CLI
+    "ocr-transform": "ocrd/all:maximum",
+    "dinglehopper": "ocrd/all:maximum",
+    "dinglehopper-extract": "ocrd/all:maximum",
+
+    # specialized images
+    "ocrd-trocr-recognize": "ocrd_trocr",
+}
diff --git a/wrapper/qurator/ocrd_galley/sub_images.py b/wrapper/qurator/ocrd_galley/sub_images.py
deleted file mode 100644
index 560fe0a..0000000
--- a/wrapper/qurator/ocrd_galley/sub_images.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# TODO is a list now, basically (no more sub images)
-sub_images = {
-    "ocrd": "core",
-    "ocrd-olena-binarize": "ocrd_olena",
-    "ocrd-sbb-binarize": "sbb_binarization",
-    "ocrd-sbb-textline-detector": "sbb_textline_detector",
-    "ocrd-calamari-recognize": "ocrd_calamari",
-    "ocrd-calamari-recognize03": "ocrd_calamari03",
-    "ocrd-tesserocr-segment-region": "ocrd_tesserocr",
-    "ocrd-tesserocr-segment-line": "ocrd_tesserocr",
-    "ocrd-tesserocr-recognize": "ocrd_tesserocr",
-    "ocrd-dinglehopper": "dinglehopper",
-    "ocrd-cis-ocropy-clip": "ocrd_cis",
-    "ocrd-cis-ocropy-resegment": "ocrd_cis",
-    "ocrd-cis-ocropy-segment": "ocrd_cis",
-    "ocrd-cis-ocropy-deskew": "ocrd_cis",
-    "ocrd-cis-ocropy-denoise": "ocrd_cis",
-    "ocrd-cis-ocropy-binarize": "ocrd_cis",
-    "ocrd-cis-ocropy-dewarp": "ocrd_cis",
-    "ocrd-cis-ocropy-recognize": "ocrd_cis",
-    "ocrd-fileformat-transform": "ocrd_fileformat",
-    "ocrd-segment-extract-pages": "ocrd_segment",
-    "ocrd-segment-extract-regions": "ocrd_segment",
-    "ocrd-segment-extract-lines": "ocrd_segment",
-    "ocrd-segment-from-masks": "ocrd_segment",
-    "ocrd-segment-from-coco": "ocrd_segment",
-    "ocrd-segment-repair": "ocrd_segment",
-    "ocrd-segment-evaluate": "ocrd_segment",
-    "ocrd-preprocess-image": "ocrd_wrap",
-    "ocrd-skimage-normalize": "ocrd_wrap",
-    "ocrd-skimage-denoise-raw": "ocrd_wrap",
-    "ocrd-skimage-binarize": "ocrd_wrap",
-    "ocrd-skimage-denoise": "ocrd_wrap",
-    "ocrd-eynollah-segment": "eynollah",
-    "ocrd-anybaseocr-binarize": "ocrd_anybaseocr",
-    "ocrd-anybaseocr-crop": "ocrd_anybaseocr",
-    "ocrd-anybaseocr-deskew": "ocrd_anybaseocr",
-    "ocrd-trocr-recognize": "ocrd_trocr",
-
-    # non OCR-D CLI
-    "ocr-transform": "ocrd_fileformat",
-    "dinglehopper": "XXX now ocrd_all",
-    "dinglehopper-extract": "XXX now ocrd_all",
-}