mirror of
https://github.com/qurator-spk/ocrd-galley.git
synced 2025-07-01 17:39:54 +02:00
🐛 Fix ocrd_trocr (by adding custom image)
This commit is contained in:
parent
956de7492f
commit
b01d2ca6a1
7 changed files with 74 additions and 52 deletions
1
.dockerignore
Normal file
1
.dockerignore
Normal file
|
@ -0,0 +1 @@
|
||||||
|
.git
|
|
@ -1,8 +1,7 @@
|
||||||
ARG GIT_COMMIT="latest"
|
FROM ocrd/all:maximum
|
||||||
FROM quratorspk/ocrd-galley-core:$GIT_COMMIT
|
|
||||||
|
|
||||||
ARG PIP_INSTALL="pip install --no-cache-dir"
|
ARG PIP_INSTALL="pip3 install --no-cache-dir"
|
||||||
ARG OCRD_TROCR_COMMIT="250ff1c"
|
ARG OCRD_TROCR_COMMIT="30696cb"
|
||||||
|
|
||||||
|
|
||||||
# Build pip installable stuff
|
# Build pip installable stuff
|
||||||
|
|
4
build.sh
Executable file
4
build.sh
Executable file
|
@ -0,0 +1,4 @@
|
||||||
|
#!/bin/sh
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
docker build . -t ocrd_trocr:latest -f Dockerfile-ocrd_trocr
|
|
@ -13,3 +13,5 @@ cd actevedef_718448162.first-page+binarization+segmentation
|
||||||
|
|
||||||
# Run tests
|
# Run tests
|
||||||
ocrd-trocr-recognize -I OCR-D-SEG-LINE-SBB -O TEST-TROCR
|
ocrd-trocr-recognize -I OCR-D-SEG-LINE-SBB -O TEST-TROCR
|
||||||
|
|
||||||
|
# TODO Does not use a useful model, does not check that text was recognized
|
||||||
|
|
|
@ -5,8 +5,9 @@ import colorama
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from termcolor import colored
|
from termcolor import colored
|
||||||
|
|
||||||
|
from .processor_images import processor_images
|
||||||
|
|
||||||
|
|
||||||
DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "maximum") # TODO rename
|
|
||||||
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
|
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
|
||||||
|
|
||||||
# xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler
|
# xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler
|
||||||
|
@ -24,9 +25,23 @@ def main():
|
||||||
argv = sys.argv.copy()
|
argv = sys.argv.copy()
|
||||||
argv[0] = os.path.basename(argv[0])
|
argv[0] = os.path.basename(argv[0])
|
||||||
|
|
||||||
docker_image = "ocrd/all:%s" % (DOCKER_IMAGE_TAG, )
|
|
||||||
|
|
||||||
if DOCKER_IMAGE_TAG != "maximum":
|
# If we're running ocrd resmgr download we need to run the correct subimage.
|
||||||
|
if argv[:3] == ["ocrd", "resmgr", "download"] or \
|
||||||
|
argv[:3] == ["ocrd", "resmgr", "list-available"]:
|
||||||
|
# Default to the base image
|
||||||
|
processor_image = processor_images[argv[0]]
|
||||||
|
# But look for a match of the executable
|
||||||
|
for x in argv[3:]:
|
||||||
|
if x in processor_images:
|
||||||
|
processor_image = processor_images[x]
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
processor_image = processor_images[argv[0]]
|
||||||
|
|
||||||
|
docker_image = processor_image
|
||||||
|
|
||||||
|
if docker_image != "ocrd/all:maximum":
|
||||||
print(colored(f"Using {docker_image}", 'red'))
|
print(colored(f"Using {docker_image}", 'red'))
|
||||||
docker_run(argv, docker_image)
|
docker_run(argv, docker_image)
|
||||||
|
|
||||||
|
|
45
wrapper/qurator/ocrd_galley/processor_images.py
Normal file
45
wrapper/qurator/ocrd_galley/processor_images.py
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
processor_images = {
|
||||||
|
"ocrd": "ocrd/all:maximum",
|
||||||
|
"ocrd-olena-binarize": "ocrd/all:maximum",
|
||||||
|
"ocrd-sbb-binarize": "ocrd/all:maximum",
|
||||||
|
"ocrd-sbb-textline-detector": "ocrd/all:maximum",
|
||||||
|
"ocrd-calamari-recognize": "ocrd/all:maximum",
|
||||||
|
"ocrd-calamari-recognize03": "ocrd/all:maximum",
|
||||||
|
"ocrd-tesserocr-segment-region": "ocrd/all:maximum",
|
||||||
|
"ocrd-tesserocr-segment-line": "ocrd/all:maximum",
|
||||||
|
"ocrd-tesserocr-recognize": "ocrd/all:maximum",
|
||||||
|
"ocrd-dinglehopper": "ocrd/all:maximum",
|
||||||
|
"ocrd-cis-ocropy-clip": "ocrd/all:maximum",
|
||||||
|
"ocrd-cis-ocropy-resegment": "ocrd/all:maximum",
|
||||||
|
"ocrd-cis-ocropy-segment": "ocrd/all:maximum",
|
||||||
|
"ocrd-cis-ocropy-deskew": "ocrd/all:maximum",
|
||||||
|
"ocrd-cis-ocropy-denoise": "ocrd/all:maximum",
|
||||||
|
"ocrd-cis-ocropy-binarize": "ocrd/all:maximum",
|
||||||
|
"ocrd-cis-ocropy-dewarp": "ocrd/all:maximum",
|
||||||
|
"ocrd-cis-ocropy-recognize": "ocrd/all:maximum",
|
||||||
|
"ocrd-fileformat-transform": "ocrd/all:maximum",
|
||||||
|
"ocrd-segment-extract-pages": "ocrd/all:maximum",
|
||||||
|
"ocrd-segment-extract-regions": "ocrd/all:maximum",
|
||||||
|
"ocrd-segment-extract-lines": "ocrd/all:maximum",
|
||||||
|
"ocrd-segment-from-masks": "ocrd/all:maximum",
|
||||||
|
"ocrd-segment-from-coco": "ocrd/all:maximum",
|
||||||
|
"ocrd-segment-repair": "ocrd/all:maximum",
|
||||||
|
"ocrd-segment-evaluate": "ocrd/all:maximum",
|
||||||
|
"ocrd-preprocess-image": "ocrd/all:maximum",
|
||||||
|
"ocrd-skimage-normalize": "ocrd/all:maximum",
|
||||||
|
"ocrd-skimage-denoise-raw": "ocrd/all:maximum",
|
||||||
|
"ocrd-skimage-binarize": "ocrd/all:maximum",
|
||||||
|
"ocrd-skimage-denoise": "ocrd/all:maximum",
|
||||||
|
"ocrd-eynollah-segment": "ocrd/all:maximum",
|
||||||
|
"ocrd-anybaseocr-binarize": "ocrd/all:maximum",
|
||||||
|
"ocrd-anybaseocr-crop": "ocrd/all:maximum",
|
||||||
|
"ocrd-anybaseocr-deskew": "ocrd/all:maximum",
|
||||||
|
|
||||||
|
# non OCR-D CLI
|
||||||
|
"ocr-transform": "ocrd/all:maximum",
|
||||||
|
"dinglehopper": "ocrd/all:maximum",
|
||||||
|
"dinglehopper-extract": "ocrd/all:maximum",
|
||||||
|
|
||||||
|
# specialized images
|
||||||
|
"ocrd-trocr-recognize": "ocrd_trocr",
|
||||||
|
}
|
|
@ -1,44 +0,0 @@
|
||||||
# TODO is a list now, basically (no more sub images)
|
|
||||||
sub_images = {
|
|
||||||
"ocrd": "core",
|
|
||||||
"ocrd-olena-binarize": "ocrd_olena",
|
|
||||||
"ocrd-sbb-binarize": "sbb_binarization",
|
|
||||||
"ocrd-sbb-textline-detector": "sbb_textline_detector",
|
|
||||||
"ocrd-calamari-recognize": "ocrd_calamari",
|
|
||||||
"ocrd-calamari-recognize03": "ocrd_calamari03",
|
|
||||||
"ocrd-tesserocr-segment-region": "ocrd_tesserocr",
|
|
||||||
"ocrd-tesserocr-segment-line": "ocrd_tesserocr",
|
|
||||||
"ocrd-tesserocr-recognize": "ocrd_tesserocr",
|
|
||||||
"ocrd-dinglehopper": "dinglehopper",
|
|
||||||
"ocrd-cis-ocropy-clip": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-resegment": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-segment": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-deskew": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-denoise": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-binarize": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-dewarp": "ocrd_cis",
|
|
||||||
"ocrd-cis-ocropy-recognize": "ocrd_cis",
|
|
||||||
"ocrd-fileformat-transform": "ocrd_fileformat",
|
|
||||||
"ocrd-segment-extract-pages": "ocrd_segment",
|
|
||||||
"ocrd-segment-extract-regions": "ocrd_segment",
|
|
||||||
"ocrd-segment-extract-lines": "ocrd_segment",
|
|
||||||
"ocrd-segment-from-masks": "ocrd_segment",
|
|
||||||
"ocrd-segment-from-coco": "ocrd_segment",
|
|
||||||
"ocrd-segment-repair": "ocrd_segment",
|
|
||||||
"ocrd-segment-evaluate": "ocrd_segment",
|
|
||||||
"ocrd-preprocess-image": "ocrd_wrap",
|
|
||||||
"ocrd-skimage-normalize": "ocrd_wrap",
|
|
||||||
"ocrd-skimage-denoise-raw": "ocrd_wrap",
|
|
||||||
"ocrd-skimage-binarize": "ocrd_wrap",
|
|
||||||
"ocrd-skimage-denoise": "ocrd_wrap",
|
|
||||||
"ocrd-eynollah-segment": "eynollah",
|
|
||||||
"ocrd-anybaseocr-binarize": "ocrd_anybaseocr",
|
|
||||||
"ocrd-anybaseocr-crop": "ocrd_anybaseocr",
|
|
||||||
"ocrd-anybaseocr-deskew": "ocrd_anybaseocr",
|
|
||||||
"ocrd-trocr-recognize": "ocrd_trocr",
|
|
||||||
|
|
||||||
# non OCR-D CLI
|
|
||||||
"ocr-transform": "ocrd_fileformat",
|
|
||||||
"dinglehopper": "XXX now ocrd_all",
|
|
||||||
"dinglehopper-extract": "XXX now ocrd_all",
|
|
||||||
}
|
|
Loading…
Add table
Add a link
Reference in a new issue