mirror of
				https://github.com/qurator-spk/ocrd-galley.git
				synced 2025-10-30 18:54:14 +01:00 
			
		
		
		
	🚧 WIP: Migrate to using ocrd:all image + Update tests
This commit is contained in:
		
							parent
							
								
									fc911f3734
								
							
						
					
					
						commit
						699023c084
					
				
					 15 changed files with 9 additions and 326 deletions
				
			
		|  | @ -1,70 +0,0 @@ | ||||||
| FROM ubuntu:22.04 |  | ||||||
| 
 |  | ||||||
| ARG PIP_INSTALL="pip install --no-cache-dir" |  | ||||||
| ARG OCRD_VERSION_MINIMUM="2.47.0" |  | ||||||
| ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 |  | ||||||
| ENV PIP_DEFAULT_TIMEOUT=120 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ |  | ||||||
|     apt-get update && \ |  | ||||||
|     apt-get install -y \ |  | ||||||
|       build-essential \ |  | ||||||
|       curl \ |  | ||||||
|       git \ |  | ||||||
|       xz-utils \ |  | ||||||
|       pkg-config \ |  | ||||||
| # For add-apt-repository: |  | ||||||
|       software-properties-common \ |  | ||||||
| # XML utils |  | ||||||
|       libxml2-utils \ |  | ||||||
|       xmlstarlet \ |  | ||||||
| # OCR-D uses ImageMagick for pixel density estimation |  | ||||||
|       imagemagick \ |  | ||||||
| # pyenv builds |  | ||||||
| # TODO: builder container? |  | ||||||
|       libz-dev \ |  | ||||||
|       libssl-dev \ |  | ||||||
|       libbz2-dev \ |  | ||||||
|       liblzma-dev \ |  | ||||||
|       libncurses-dev \ |  | ||||||
|       libffi-dev \ |  | ||||||
|       libreadline-dev \ |  | ||||||
|       libsqlite3-dev \ |  | ||||||
|       libmagic-dev \ |  | ||||||
|     && \ |  | ||||||
|     apt-get clean && \ |  | ||||||
|     rm -rf /var/lib/apt/lists/* |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Set up OCR-D logging |  | ||||||
| RUN echo "setOverrideLogLevel(os.getenv('LOG_LEVEL', 'INFO'))" >/etc/ocrd_logging.py |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Install pyenv |  | ||||||
| # TODO: do not run as root |  | ||||||
| # TODO: does just saying "3.7" work as intended? |  | ||||||
| ENV HOME=/root |  | ||||||
| ENV PYENV_ROOT=/usr/local/share/pyenv |  | ||||||
| ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH |  | ||||||
| RUN \ |  | ||||||
|     git clone --depth=1 https://github.com/yyuu/pyenv.git $PYENV_ROOT && \ |  | ||||||
|     pyenv install 3.7 && \ |  | ||||||
|     pyenv global 3.7 && \ |  | ||||||
|     pyenv rehash && \ |  | ||||||
|     pip install -U pip wheel && \ |  | ||||||
|     pip install setuptools |  | ||||||
| 
 |  | ||||||
| # Install pip installable-stuff |  | ||||||
| RUN ${PIP_INSTALL} \ |  | ||||||
|         "ocrd >= ${OCRD_VERSION_MINIMUM}" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Check pip dependencies |  | ||||||
| RUN pip check |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| WORKDIR /data |  | ||||||
| 
 |  | ||||||
| # Default command |  | ||||||
| CMD ['ocrd'] |  | ||||||
|  | @ -1,18 +0,0 @@ | ||||||
| ARG GIT_COMMIT="latest" |  | ||||||
| FROM quratorspk/ocrd-galley-core:$GIT_COMMIT |  | ||||||
| 
 |  | ||||||
| ARG PIP_INSTALL="pip install --no-cache-dir" |  | ||||||
| ARG DINGLEHOPPER_VERSION="0.9.2" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Build pip installable stuff |  | ||||||
| RUN ${PIP_INSTALL} \ |  | ||||||
|         "dinglehopper == $DINGLEHOPPER_VERSION" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Check pip dependencies |  | ||||||
| RUN pip check |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Default command |  | ||||||
| CMD ["ocrd-dinglehopper"] |  | ||||||
|  | @ -1,18 +0,0 @@ | ||||||
| ARG GIT_COMMIT="latest" |  | ||||||
| FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT |  | ||||||
| 
 |  | ||||||
| ARG PIP_INSTALL="pip install --no-cache-dir" |  | ||||||
| ARG EYNOLLAH_VERSION="0.3.0" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Build pip installable stuff |  | ||||||
| RUN ${PIP_INSTALL} \ |  | ||||||
|     "eynollah == ${EYNOLLAH_VERSION}" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Check pip dependencies |  | ||||||
| RUN pip check |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Default command |  | ||||||
| CMD ["ocrd-eynollah-segment"] |  | ||||||
|  | @ -1,24 +0,0 @@ | ||||||
| ARG GIT_COMMIT="latest" |  | ||||||
| FROM quratorspk/ocrd-galley-core:$GIT_COMMIT |  | ||||||
| 
 |  | ||||||
| ARG PIP_INSTALL="pip install --no-cache-dir" |  | ||||||
| ARG OCRD_FILEFORMAT_VERSION="0.5.0" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| RUN apt-get update && \ |  | ||||||
|     apt-get install -y \ |  | ||||||
|       git \ |  | ||||||
|       openjdk-11-jdk-headless \ |  | ||||||
|       wget \ |  | ||||||
|       unzip \ |  | ||||||
|     && \ |  | ||||||
|     apt-get clean && rm -rf /var/lib/apt/lists/* |  | ||||||
| RUN git clone --depth 1 --branch v${OCRD_FILEFORMAT_VERSION} https://github.com/OCR-D/ocrd_fileformat.git && \ |  | ||||||
|     cd ocrd_fileformat/ && \ |  | ||||||
|     git submodule update --init && \ |  | ||||||
|     make install-fileformat install PREFIX=/usr/local && \ |  | ||||||
|     cd .. && rm -rf ocrd_fileformat/ |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Default command |  | ||||||
| CMD ['ocrd-fileformat-transform'] |  | ||||||
|  | @ -1,39 +0,0 @@ | ||||||
| ARG GIT_COMMIT="latest" |  | ||||||
| FROM quratorspk/ocrd-galley-core:$GIT_COMMIT |  | ||||||
| 
 |  | ||||||
| ARG PIP_INSTALL="pip install --no-cache-dir" |  | ||||||
| ARG OCRD_OLENA_VERSION="1.3.0" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Build ocrd_olena |  | ||||||
| RUN apt-get update && \ |  | ||||||
|     apt-get install -y \ |  | ||||||
|       imagemagick \ |  | ||||||
|     && \ |  | ||||||
|     apt-get clean && rm -rf /var/lib/apt/lists/* |  | ||||||
| 
 |  | ||||||
| # Install olena from .deb |  | ||||||
| RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1.0+ocrd-git+2-ubuntu22.04/olena-bin_2.1.0+ocrd-git+2_amd64.deb && \ |  | ||||||
|     dpkg -i --force-depends olena-bin_2.1.0+ocrd-git+2_amd64.deb && \ |  | ||||||
|     rm -f olena-bin_2.1.0+ocrd-git+2_amd64.deb && \ |  | ||||||
|     apt-get update && \ |  | ||||||
|     apt-get -f install -y && \ |  | ||||||
|     apt-get clean && rm -rf /var/lib/apt/lists/* && \ |  | ||||||
|     if ! scribo-cli sauvola --help >/dev/null 2>&1; then echo "Olena/scribo is not installed correctly" >&2; exit 1; fi |  | ||||||
| RUN curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ |  | ||||||
|    mkdir ocrd_olena && \ |  | ||||||
|    tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \ |  | ||||||
|    cd ocrd_olena && \ |  | ||||||
|    sed -i 's/^install: deps/install:/' Makefile && \ |  | ||||||
|    ${PIP_INSTALL} ocrd && \ |  | ||||||
|    apt install xmlstarlet && \ |  | ||||||
|    make install PREFIX=/usr/local && \ |  | ||||||
|    cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Check pip dependencies |  | ||||||
| RUN pip check |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Default command |  | ||||||
| CMD ['ocrd-olena-binarize'] |  | ||||||
|  | @ -1,19 +0,0 @@ | ||||||
| ARG GIT_COMMIT="latest" |  | ||||||
| FROM quratorspk/ocrd-galley-core:$GIT_COMMIT |  | ||||||
| 
 |  | ||||||
| ARG PIP_INSTALL="pip install --no-cache-dir" |  | ||||||
| ARG OCRD_SEGMENT_VERSION="0.1.22" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Build pip installable stuff |  | ||||||
| RUN ${PIP_INSTALL} \ |  | ||||||
| # Now the real stuff: |  | ||||||
|         "ocrd-segment == ${OCRD_SEGMENT_VERSION}" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Check pip dependencies |  | ||||||
| RUN pip check |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Default command |  | ||||||
| CMD ["ocrd-segment-extract-regions"] |  | ||||||
|  | @ -1,31 +0,0 @@ | ||||||
| ARG GIT_COMMIT="latest" |  | ||||||
| FROM quratorspk/ocrd-galley-core:$GIT_COMMIT |  | ||||||
| 
 |  | ||||||
| ARG PIP_INSTALL="pip install --no-cache-dir" |  | ||||||
| ARG TESSDATA_BEST_VERSION="4.0.0" |  | ||||||
| ARG OCRD_TESSEROCR_VERSION="0.17.0" |  | ||||||
| ENV TESSDATA_PREFIX /usr/local/share/tessdata |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Install Leptonica and Tesseract. |  | ||||||
| # TODO: Review if alex-p's repo is still necessary on jammy (jammy has 4.1.1, |  | ||||||
| # alex-p has 4.1.3, but not for jammy.) |  | ||||||
| # RUN add-apt-repository ppa:alex-p/tesseract-ocr && \ |  | ||||||
| RUN apt-get update && \ |  | ||||||
|     apt-get install -y \ |  | ||||||
|         tesseract-ocr \ |  | ||||||
|         libtesseract-dev \ |  | ||||||
|     && \ |  | ||||||
|     apt-get clean && rm -rf /var/lib/apt/lists/* |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Build pip installable stuff |  | ||||||
| RUN ${PIP_INSTALL} \ |  | ||||||
|     "ocrd_tesserocr == ${OCRD_TESSEROCR_VERSION}" |  | ||||||
| 
 |  | ||||||
| # Check pip dependencies |  | ||||||
| RUN pip check |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Default command |  | ||||||
| CMD ["ocrd-tesserocr-recognize"] |  | ||||||
|  | @ -1,18 +0,0 @@ | ||||||
| ARG GIT_COMMIT="latest" |  | ||||||
| FROM quratorspk/ocrd-galley-core:$GIT_COMMIT |  | ||||||
| 
 |  | ||||||
| ARG PIP_INSTALL="pip install --no-cache-dir" |  | ||||||
| ARG OCRD_WRAP_VERSION="0.1.7" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Build pip installable stuff |  | ||||||
| RUN ${PIP_INSTALL} \ |  | ||||||
|     "ocrd_wrap == ${OCRD_WRAP_VERSION}" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Check pip dependencies |  | ||||||
| RUN pip check |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Default command |  | ||||||
| CMD ["ocrd-preprocess-image"] |  | ||||||
|  | @ -1,20 +0,0 @@ | ||||||
| ARG GIT_COMMIT="latest" |  | ||||||
| FROM quratorspk/ocrd-galley-core-cuda12.1:$GIT_COMMIT |  | ||||||
| 
 |  | ||||||
| ARG PIP_INSTALL="pip install --no-cache-dir" |  | ||||||
| ARG SBB_TEXTLINE_DETECTOR_COMMIT="c4df3d6" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Build pip installable stuff |  | ||||||
| RUN ${PIP_INSTALL} \ |  | ||||||
|     # https://github.com/qurator-spk/sbb_textline_detection/issues/50 |  | ||||||
|     "h5py < 3" \ |  | ||||||
|     https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Check pip dependencies |  | ||||||
| RUN pip check |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Default command |  | ||||||
| CMD ["ocrd-sbb-textline-detector"] |  | ||||||
							
								
								
									
										33
									
								
								build
									
										
									
									
									
								
							
							
						
						
									
										33
									
								
								build
									
										
									
									
									
								
							|  | @ -1,33 +0,0 @@ | ||||||
| #!/bin/bash |  | ||||||
| set -e |  | ||||||
| 
 |  | ||||||
| self=`realpath $0` |  | ||||||
| self_dir=`dirname "$self"` |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| if [ -n "$1" ]; then |  | ||||||
|   sub_images="" |  | ||||||
|   for arg in "$@"; do |  | ||||||
|     arg_sub_image=`echo "$arg" | sed 's/Dockerfile-//'` |  | ||||||
|     NL=$'\n' |  | ||||||
|     sub_images+="$NL$arg_sub_image" |  | ||||||
|   done |  | ||||||
| else |  | ||||||
|   sub_images=`ls -1 Dockerfile-core* | sed 's/Dockerfile-//'` |  | ||||||
|   sub_images="$sub_images `ls -1 Dockerfile-* | sed 's/Dockerfile-//'`" |  | ||||||
| fi |  | ||||||
| echo "Building:" |  | ||||||
| echo "$sub_images" |  | ||||||
| echo |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Update base images if we build a core image |  | ||||||
| if echo "$sub_images" | grep -q core; then |  | ||||||
|   docker pull ubuntu:22.04 |  | ||||||
|   docker pull nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04 |  | ||||||
| fi |  | ||||||
| 
 |  | ||||||
| for sub_image in $sub_images; do |  | ||||||
|   docker build --cache-from=quratorspk/ocrd-galley-$sub_image -t quratorspk/ocrd-galley-$sub_image -f Dockerfile-$sub_image . |  | ||||||
| done |  | ||||||
|  | @ -5,6 +5,8 @@ test_id=`basename $0` | ||||||
| cd `mktemp -d /tmp/$test_id-XXXXX` | cd `mktemp -d /tmp/$test_id-XXXXX` | ||||||
| 
 | 
 | ||||||
| # Prepare processors | # Prepare processors | ||||||
|  | ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata | ||||||
|  | ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata | ||||||
| ocrd resmgr download ocrd-tesserocr-recognize Fraktur_GT4HistOCR.traineddata | ocrd resmgr download ocrd-tesserocr-recognize Fraktur_GT4HistOCR.traineddata | ||||||
| 
 | 
 | ||||||
| # Prepare test workspace | # Prepare test workspace | ||||||
|  |  | ||||||
|  | @ -5,7 +5,7 @@ test_id=`basename $0` | ||||||
| cd `mktemp -d /tmp/$test_id-XXXXX` | cd `mktemp -d /tmp/$test_id-XXXXX` | ||||||
| 
 | 
 | ||||||
| # Prepare processors | # Prepare processors | ||||||
| ocrd resmgr download ocrd-sbb-binarize default-2021-03-09 | ocrd resmgr download ocrd-sbb-binarize default | ||||||
| 
 | 
 | ||||||
| # Prepare test workspace | # Prepare test workspace | ||||||
| wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip | wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip | ||||||
|  | @ -13,4 +13,4 @@ unzip actevedef_718448162.first-page+binarization+segmentation.zip | ||||||
| cd actevedef_718448162.first-page+binarization+segmentation | cd actevedef_718448162.first-page+binarization+segmentation | ||||||
| 
 | 
 | ||||||
| # Run tests | # Run tests | ||||||
| ocrd-sbb-binarize -P model default-2021-03-09 -I OCR-D-IMG -O TEST-OCRD-SBB-BINARIZE | ocrd-sbb-binarize -P model default -I OCR-D-IMG -O TEST-OCRD-SBB-BINARIZE | ||||||
|  |  | ||||||
|  | @ -1,16 +0,0 @@ | ||||||
| #!/bin/sh |  | ||||||
| set -ex |  | ||||||
| 
 |  | ||||||
| test_id=`basename $0` |  | ||||||
| cd `mktemp -d /tmp/$test_id-XXXXX` |  | ||||||
| 
 |  | ||||||
| # Prepare processors |  | ||||||
| ocrd resmgr download ocrd-sbb-textline-detector default |  | ||||||
| 
 |  | ||||||
| # Prepare test workspace |  | ||||||
| wget https://qurator-data.de/examples/actevedef_718448162.first-page+binarization+segmentation.zip |  | ||||||
| unzip actevedef_718448162.first-page+binarization+segmentation.zip |  | ||||||
| cd actevedef_718448162.first-page+binarization+segmentation |  | ||||||
| 
 |  | ||||||
| # Run tests |  | ||||||
| ocrd-sbb-textline-detector -P models default -I OCR-D-IMG-BIN -O TEST-EYNOLLAH-SEG |  | ||||||
|  | @ -5,10 +5,8 @@ import colorama | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from termcolor import colored | from termcolor import colored | ||||||
| 
 | 
 | ||||||
| from .sub_images import sub_images |  | ||||||
| 
 | 
 | ||||||
| DOCKER_IMAGE_PREFIX = os.environ.get("DOCKER_IMAGE_PREFIX", "quratorspk/ocrd-galley") | DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "maximum")  # TODO rename | ||||||
| DOCKER_IMAGE_TAG = os.environ.get("DOCKER_IMAGE_TAG", "latest") |  | ||||||
| LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") | LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") | ||||||
| 
 | 
 | ||||||
| # xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler | # xdg-user-dirs is only available under Python 3.10+ etc. pp. → it is simpler | ||||||
|  | @ -26,22 +24,9 @@ def main(): | ||||||
|     argv = sys.argv.copy() |     argv = sys.argv.copy() | ||||||
|     argv[0] = os.path.basename(argv[0]) |     argv[0] = os.path.basename(argv[0]) | ||||||
| 
 | 
 | ||||||
|     # If we're running ocrd resmgr download we need to run the correct subimage. |     docker_image = "ocrd/all:%s" % (DOCKER_IMAGE_TAG, ) | ||||||
|     if argv[:3] == ["ocrd", "resmgr", "download"] or \ |  | ||||||
|        argv[:3] == ["ocrd", "resmgr", "list-available"]: |  | ||||||
|         # Default to the base image |  | ||||||
|         sub_image = sub_images[argv[0]] |  | ||||||
|         # But look for a match of the executable |  | ||||||
|         for x in argv[3:]: |  | ||||||
|             if x in sub_images: |  | ||||||
|                 sub_image = sub_images[x] |  | ||||||
|                 break |  | ||||||
|     else: |  | ||||||
|         sub_image = sub_images[argv[0]] |  | ||||||
| 
 | 
 | ||||||
|     docker_image = "%s-%s:%s" % (DOCKER_IMAGE_PREFIX, sub_image, DOCKER_IMAGE_TAG) |     if DOCKER_IMAGE_TAG != "maximum": | ||||||
| 
 |  | ||||||
|     if DOCKER_IMAGE_TAG != "latest": |  | ||||||
|         print(colored(f"Using {docker_image}", 'red')) |         print(colored(f"Using {docker_image}", 'red')) | ||||||
|     docker_run(argv, docker_image) |     docker_run(argv, docker_image) | ||||||
| 
 | 
 | ||||||
|  | @ -50,6 +35,7 @@ def docker_run(argv, docker_image): | ||||||
|     docker_run_options = [] |     docker_run_options = [] | ||||||
|     docker_run_options.extend(["--rm", "-t"]) |     docker_run_options.extend(["--rm", "-t"]) | ||||||
|     docker_run_options.extend(["--mount", "type=bind,src=%s,target=/data" % os.getcwd()]) |     docker_run_options.extend(["--mount", "type=bind,src=%s,target=/data" % os.getcwd()]) | ||||||
|  |     docker_run_options.extend(["--mount", "type=tmpfs,target=/tmp"]) | ||||||
|     docker_run_options.extend(["--user", "%s:%s" % (os.getuid(), os.getgid())]) |     docker_run_options.extend(["--user", "%s:%s" % (os.getuid(), os.getgid())]) | ||||||
|     docker_run_options.extend(["-e", "LOG_LEVEL=%s" % LOG_LEVEL]) |     docker_run_options.extend(["-e", "LOG_LEVEL=%s" % LOG_LEVEL]) | ||||||
|     docker_run_options.extend(["-e", "_OCRD_COMPLETE"]) |     docker_run_options.extend(["-e", "_OCRD_COMPLETE"]) | ||||||
|  |  | ||||||
|  | @ -1,3 +1,4 @@ | ||||||
|  | # TODO is a list now, basically (no more sub images) | ||||||
| sub_images = { | sub_images = { | ||||||
|         "ocrd": "core", |         "ocrd": "core", | ||||||
|         "ocrd-olena-binarize": "ocrd_olena", |         "ocrd-olena-binarize": "ocrd_olena", | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue