mirror of
				https://github.com/qurator-spk/ocrd-galley.git
				synced 2025-10-31 11:14:12 +01:00 
			
		
		
		
	✨ Move processors into their own Docker container
This commit is contained in:
		
							parent
							
								
									894cbeee32
								
							
						
					
					
						commit
						02eae7b6fa
					
				
					 11 changed files with 208 additions and 122 deletions
				
			
		
							
								
								
									
										86
									
								
								Dockerfile
									
										
									
									
									
								
							
							
						
						
									
										86
									
								
								Dockerfile
									
										
									
									
									
								
							|  | @ -1,86 +0,0 @@ | ||||||
| FROM ubuntu:18.04 |  | ||||||
| 
 |  | ||||||
| ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 |  | ||||||
| ENV PIP_DEFAULT_TIMEOUT=120 |  | ||||||
| 
 |  | ||||||
| ENV OCRD_OLENA_VERSION 1.2.0 |  | ||||||
| ENV TESSDATA_BEST_VERSION 4.0.0 |  | ||||||
| ENV TESSDATA_PREFIX /usr/local/share/tessdata |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ |  | ||||||
|     apt-get update && \ |  | ||||||
|     apt-get install -y \ |  | ||||||
|       curl xz-utils \ |  | ||||||
|       python3-pip \ |  | ||||||
|       git \ |  | ||||||
|       software-properties-common \ |  | ||||||
| # For clstm on Ubuntu 19.04: |  | ||||||
|       swig libeigen3-dev libpng-dev libprotobuf-dev \ |  | ||||||
| # For cv2: |  | ||||||
|       libsm6 libxrender1 \ |  | ||||||
| # For ocrd_olena: |  | ||||||
|       imagemagick \ |  | ||||||
| # XML utils |  | ||||||
|       libxml2-utils \ |  | ||||||
|       xmlstarlet \ |  | ||||||
|     && \ |  | ||||||
|     apt-get clean && rm -rf /var/lib/apt/lists/* |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Install Leptonica and Tesseract. |  | ||||||
| RUN add-apt-repository ppa:alex-p/tesseract-ocr && \ |  | ||||||
|     apt-get update && \ |  | ||||||
|     apt-get install -y \ |  | ||||||
|         tesseract-ocr \ |  | ||||||
|         libtesseract-dev \ |  | ||||||
|     && \ |  | ||||||
|     apt-get clean && rm -rf /var/lib/apt/lists/* |  | ||||||
| 
 |  | ||||||
| # Set up OCR-D logging |  | ||||||
| COPY ocrd_logging.py /etc/ |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Build ocrd_olena |  | ||||||
| # XXX .deb needs an update |  | ||||||
| RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena-bin_2.1-0+ocrd-git_amd64.deb && \ |  | ||||||
|     dpkg -i --force-depends olena-bin_2.1-0+ocrd-git_amd64.deb && \ |  | ||||||
|     rm -f olena-bin_2.1-0+ocrd-git_amd64.deb && \ |  | ||||||
|     apt-get update && \ |  | ||||||
|     apt-get -f install -y && \ |  | ||||||
|     apt-get clean && rm -rf /var/lib/apt/lists/* |  | ||||||
| RUN pip3 install --no-cache-dir --upgrade pip && \ |  | ||||||
|    curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ |  | ||||||
|    mkdir ocrd_olena && \ |  | ||||||
|    tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \ |  | ||||||
|    cd ocrd_olena && \ |  | ||||||
|    sed -i 's/^install: deps$/install:/' Makefile && \ |  | ||||||
|    pip3 install --no-cache-dir --use-feature=2020-resolver ocrd && \ |  | ||||||
|    make install PREFIX=/usr/local && \ |  | ||||||
|    cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Copy OCR models |  | ||||||
| RUN mkdir -p /var/lib/calamari-models |  | ||||||
| COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR |  | ||||||
| RUN mkdir -p $TESSDATA_PREFIX |  | ||||||
| ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/ |  | ||||||
| COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/ |  | ||||||
| COPY data/textline_detection /var/lib/textline_detection |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Install requirements |  | ||||||
| # Using pipdeptree here to get more info than from pip3 check |  | ||||||
| COPY requirements.txt /tmp/ |  | ||||||
| RUN pip3 install --no-cache-dir --upgrade pip && \ |  | ||||||
|     pip3 install --no-cache-dir --use-feature=2020-resolver -r /tmp/requirements.txt && \ |  | ||||||
|     pip3 install --no-cache-dir pipdeptree && \ |  | ||||||
|     pipdeptree -w fail |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| COPY my_ocrd_workflow /usr/bin/ |  | ||||||
| COPY xsd/*            /usr/share/xml/ |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| WORKDIR /data |  | ||||||
| ENTRYPOINT ["/usr/bin/my_ocrd_workflow"] |  | ||||||
							
								
								
									
										41
									
								
								Dockerfile-boxed-base
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										41
									
								
								Dockerfile-boxed-base
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,41 @@ | ||||||
|  | # Shared base image for the boxed processor images (they all start with | ||||||
|  | # "FROM boxed-base"): Ubuntu 18.04 plus pip3, XML command line tools and ocrd. | ||||||
|  | FROM ubuntu:18.04 | ||||||
|  | 
 | ||||||
|  | ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 | ||||||
|  | # pip socket timeout in seconds | ||||||
|  | ENV PIP_DEFAULT_TIMEOUT=120 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Configure apt to retry failed downloads, install the base packages and drop | ||||||
|  | # the apt caches in the same layer so they do not end up in the image. | ||||||
|  | RUN echo "APT::Acquire::Retries \"3\";" > /etc/apt/apt.conf.d/80-retries && \ | ||||||
|  |     apt-get update && \ | ||||||
|  |     apt-get install -y \ | ||||||
|  |       curl xz-utils \ | ||||||
|  |       python3-pip \ | ||||||
|  | # For add-apt-repository: | ||||||
|  |       software-properties-common \ | ||||||
|  | # XML utils | ||||||
|  |       libxml2-utils \ | ||||||
|  |       xmlstarlet \ | ||||||
|  |     && \ | ||||||
|  |     apt-get clean && \ | ||||||
|  |     rm -rf /var/lib/apt/lists/* | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Set up OCR-D logging | ||||||
|  | COPY ocrd_logging.py /etc/ | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Build pip installable stuff | ||||||
|  | RUN pip3 install --no-cache-dir --upgrade pip && \ | ||||||
|  |     pip3 install --no-cache-dir \ | ||||||
|  | # Resolve conflicts early: | ||||||
|  |         'setuptools >= 41.0.0' \ | ||||||
|  |         'ocrd >= 2.13.1' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Check pip dependencies | ||||||
|  | RUN pip3 check | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | WORKDIR /data | ||||||
|  | 
 | ||||||
|  | # Default command. Exec form must be a JSON array, i.e. use double quotes; | ||||||
|  | # with single quotes Docker falls back to shell form and passes the literal | ||||||
|  | # text ['ocrd'] to /bin/sh -c. | ||||||
|  | CMD ["ocrd"] | ||||||
							
								
								
									
										18
									
								
								Dockerfile-boxed-dinglehopper
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								Dockerfile-boxed-dinglehopper
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,18 @@ | ||||||
|  | # Installs dinglehopper from a pinned GitHub commit archive on top of | ||||||
|  | # boxed-base; the default command is ocrd-dinglehopper. | ||||||
|  | FROM boxed-base | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Commit of qurator-spk/dinglehopper to install (used in the archive URL below) | ||||||
|  | ENV DINGLEHOPPER_COMMIT=2b98f69 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Build pip installable stuff | ||||||
|  | RUN pip3 install --no-cache-dir \ | ||||||
|  | # Now the real stuff: | ||||||
|  |     https://github.com/qurator-spk/dinglehopper/archive/$DINGLEHOPPER_COMMIT.tar.gz | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Check pip dependencies | ||||||
|  | RUN pip3 check | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Default command | ||||||
|  | CMD ["ocrd-dinglehopper"] | ||||||
							
								
								
									
										24
									
								
								Dockerfile-boxed-ocrd_calamari
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								Dockerfile-boxed-ocrd_calamari
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,24 @@ | ||||||
|  | # Installs ocrd_calamari (with pinned tensorflow-gpu and calamari-ocr) on top | ||||||
|  | # of boxed-base and copies the models from data/calamari-models/GT4HistOCR; | ||||||
|  | # the default command is ocrd-calamari-recognize. | ||||||
|  | FROM boxed-base | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Build pip installable stuff | ||||||
|  | RUN pip3 install --no-cache-dir \ | ||||||
|  | # Resolve conflicts early: | ||||||
|  |         'tensorflow-gpu == 1.15.*' \ | ||||||
|  |         'calamari-ocr == 0.3.5' \ | ||||||
|  | # Now the real stuff: | ||||||
|  |         'ocrd_calamari >= 0.0.7' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Copy OCR models. COPY creates the destination directory together with any | ||||||
|  | # missing parent directories, so no separate "RUN mkdir -p" layer is needed. | ||||||
|  | COPY data/calamari-models/GT4HistOCR /var/lib/calamari-models/GT4HistOCR | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Check pip dependencies | ||||||
|  | RUN pip3 check | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Default command | ||||||
|  | CMD ["ocrd-calamari-recognize"] | ||||||
							
								
								
									
										33
									
								
								Dockerfile-boxed-ocrd_olena
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										33
									
								
								Dockerfile-boxed-ocrd_olena
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,33 @@ | ||||||
|  | # Installs the prebuilt olena .deb and ocrd_olena on top of boxed-base; | ||||||
|  | # the default command is ocrd-olena-binarize. | ||||||
|  | FROM boxed-base | ||||||
|  | 
 | ||||||
|  | # Release of OCR-D/ocrd_olena to download (used in the archive URL below) | ||||||
|  | ENV OCRD_OLENA_VERSION=1.2.0 | ||||||
|  | 
 | ||||||
|  | # Build ocrd_olena | ||||||
|  | RUN apt-get update && \ | ||||||
|  |     apt-get install -y \ | ||||||
|  |       imagemagick \ | ||||||
|  |     && \ | ||||||
|  |     apt-get clean && rm -rf /var/lib/apt/lists/* | ||||||
|  | # Install the olena .deb without its dependencies first (--force-depends), | ||||||
|  | # then let "apt-get -f install" pull in whatever is missing. | ||||||
|  | RUN curl -sSL --retry 3 -O https://qurator-data.de/~mike.gerber/olena_2.1-0+ocrd-git/olena-bin_2.1-0+ocrd-git_amd64.deb && \ | ||||||
|  |     dpkg -i --force-depends olena-bin_2.1-0+ocrd-git_amd64.deb && \ | ||||||
|  |     rm -f olena-bin_2.1-0+ocrd-git_amd64.deb && \ | ||||||
|  |     apt-get update && \ | ||||||
|  |     apt-get -f install -y && \ | ||||||
|  |     apt-get clean && rm -rf /var/lib/apt/lists/* | ||||||
|  | # Download, unpack and install ocrd_olena, removing the sources in the same layer. | ||||||
|  | RUN pip3 install --no-cache-dir --upgrade pip && \ | ||||||
|  |    curl -sSL --retry 3 -o ocrd_olena.tar.gz https://github.com/OCR-D/ocrd_olena/archive/v${OCRD_OLENA_VERSION}.tar.gz && \ | ||||||
|  |    mkdir ocrd_olena && \ | ||||||
|  |    tar xvz -C ocrd_olena --strip-components=1 -f ocrd_olena.tar.gz && \ | ||||||
|  |    cd ocrd_olena && \ | ||||||
|  | # Drop the "deps" prerequisite from the Makefile's install target; ocrd is | ||||||
|  | # installed with pip3 on the next line instead. | ||||||
|  |    sed -i 's/^install: deps$/install:/' Makefile && \ | ||||||
|  |    pip3 install --no-cache-dir --use-feature=2020-resolver ocrd && \ | ||||||
|  |    make install PREFIX=/usr/local && \ | ||||||
|  |    cd .. && rm -rf ocrd_olena ocrd_olena.tar.gz | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Check pip dependencies | ||||||
|  | RUN pip3 check | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Default command. Exec form must be a JSON array, i.e. use double quotes; | ||||||
|  | # with single quotes Docker falls back to shell form and passes the literal | ||||||
|  | # text ['ocrd-olena-binarize'] to /bin/sh -c. | ||||||
|  | CMD ["ocrd-olena-binarize"] | ||||||
							
								
								
									
										35
									
								
								Dockerfile-boxed-ocrd_tesserocr
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								Dockerfile-boxed-ocrd_tesserocr
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,35 @@ | ||||||
|  | # Installs Tesseract from the alex-p PPA, the tessdata models and | ||||||
|  | # ocrd_tesserocr on top of boxed-base; the default command is | ||||||
|  | # ocrd-tesserocr-recognize. | ||||||
|  | FROM boxed-base | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Version of the mirrored tessdata_best archive, and the directory the | ||||||
|  | # models are placed in | ||||||
|  | ENV TESSDATA_BEST_VERSION=4.0.0 | ||||||
|  | ENV TESSDATA_PREFIX=/usr/local/share/tessdata | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Install Leptonica and Tesseract. | ||||||
|  | RUN add-apt-repository ppa:alex-p/tesseract-ocr && \ | ||||||
|  |     apt-get update && \ | ||||||
|  |     apt-get install -y \ | ||||||
|  |         tesseract-ocr \ | ||||||
|  |         libtesseract-dev \ | ||||||
|  |     && \ | ||||||
|  |     apt-get clean && rm -rf /var/lib/apt/lists/* | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Copy OCR models | ||||||
|  | RUN mkdir -p $TESSDATA_PREFIX | ||||||
|  | # ADD (rather than COPY) unpacks a local tar archive into the target directory | ||||||
|  | ADD data/mirror/github.com/tesseract-ocr/tessdata_best/archive/${TESSDATA_BEST_VERSION}-repacked.tar.gz $TESSDATA_PREFIX/ | ||||||
|  | COPY data/tesseract-models/GT4HistOCR/GT4HistOCR_2000000.traineddata $TESSDATA_PREFIX/ | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Build pip installable stuff | ||||||
|  | RUN pip3 install --no-cache-dir \ | ||||||
|  | # Now the real stuff: | ||||||
|  |     'ocrd_tesserocr >= 0.9.0' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Check pip dependencies | ||||||
|  | RUN pip3 check | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Default command | ||||||
|  | CMD ["ocrd-tesserocr-recognize"] | ||||||
							
								
								
									
										22
									
								
								Dockerfile-boxed-sbb_textline_detector
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								Dockerfile-boxed-sbb_textline_detector
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,22 @@ | ||||||
|  | # Installs sbb_textline_detector from a pinned GitHub commit archive on top | ||||||
|  | # of boxed-base and copies its models; the default command is | ||||||
|  | # ocrd-sbb-textline-detector. | ||||||
|  | FROM boxed-base | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Commit of qurator-spk/sbb_textline_detector to install (used in the archive URL below) | ||||||
|  | ENV SBB_TEXTLINE_DETECTOR_COMMIT=8b01d9e | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Build pip installable stuff | ||||||
|  | RUN pip3 install --no-cache-dir \ | ||||||
|  | # Now the real stuff: | ||||||
|  |     https://github.com/qurator-spk/sbb_textline_detector/archive/$SBB_TEXTLINE_DETECTOR_COMMIT.tar.gz | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Copy OCR models | ||||||
|  | COPY data/textline_detection /var/lib/textline_detection | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Check pip dependencies | ||||||
|  | RUN pip3 check | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Default command | ||||||
|  | CMD ["ocrd-sbb-textline-detector"] | ||||||
							
								
								
									
										7
									
								
								build
									
										
									
									
									
								
							
							
						
						
									
										7
									
								
								build
									
										
									
									
									
								
							|  | @ -21,4 +21,9 @@ get_from_web() { | ||||||
| handle_data | handle_data | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| docker build --cache-from my_ocrd_workflow -t my_ocrd_workflow . | docker build -t boxed-base                  -f Dockerfile-boxed-base                  . | ||||||
|  | docker build -t boxed-ocrd_calamari         -f Dockerfile-boxed-ocrd_calamari         . | ||||||
|  | docker build -t boxed-dinglehopper          -f Dockerfile-boxed-dinglehopper          . | ||||||
|  | docker build -t boxed-ocrd_olena            -f Dockerfile-boxed-ocrd_olena            . | ||||||
|  | docker build -t boxed-ocrd_tesserocr        -f Dockerfile-boxed-ocrd_tesserocr        . | ||||||
|  | docker build -t boxed-sbb_textline_detector -f Dockerfile-boxed-sbb_textline_detector . | ||||||
|  |  | ||||||
|  | @ -1,13 +0,0 @@ | ||||||
| tensorflow-gpu < 2.0  # Needed for sbb_text_linedetector |  | ||||||
| 
 |  | ||||||
| ocrd >= 2.13.1 |  | ||||||
| 
 |  | ||||||
| # XXX See https://github.com/OCR-D/ocrd_tesserocr/issues/135 |  | ||||||
| # ocrd_tesserocr >= 0.8.XXX |  | ||||||
| https://github.com/mikegerber/ocrd_tesserocr/archive/fix/set-pcgtsid.tar.gz |  | ||||||
| 
 |  | ||||||
| ocrd_calamari >= 0.0.7 |  | ||||||
| 
 |  | ||||||
| https://github.com/qurator-spk/sbb_textline_detector/archive/8b01d9e.tar.gz |  | ||||||
| 
 |  | ||||||
| https://github.com/qurator-spk/dinglehopper/archive/2b98f69.tar.gz |  | ||||||
							
								
								
									
										47
									
								
								run
									
										
									
									
									
								
							
							
						
						
									
										47
									
								
								run
									
										
									
									
									
								
							|  | @ -1,31 +1,42 @@ | ||||||
| #!/bin/sh | #!/bin/bash | ||||||
| # Run the my_ocrd_workflow container on the current workspace |  | ||||||
| 
 | 
 | ||||||
| set -e  # Abort on error | set -e  # Abort on error | ||||||
| 
 | 
 | ||||||
| DOCKER_IMAGE=${DOCKER_IMAGE:-my_ocrd_workflow:latest}  # default to locally built | self=`realpath $0` | ||||||
| 
 | self_dir=`dirname "$self"` | ||||||
| if echo "$DOCKER_IMAGE" | grep -q "/"; then |  | ||||||
|   docker pull "$DOCKER_IMAGE" |  | ||||||
| fi |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # XXX Work around podman vs docker uid behaviour | # Docker run options | ||||||
|  | docker_run_options="--rm -t" | ||||||
|  | docker_run_options="$docker_run_options --mount type=bind,src=\"$(pwd)\",target=/data" | ||||||
|  | # In podman, the container always runs as the real user == uid 0 in container | ||||||
| if docker -v 2>&1 | grep -q podman; then | if docker -v 2>&1 | grep -q podman; then | ||||||
|   user="0:0" |   user="0:0" | ||||||
| else | else | ||||||
|   user="`id -u`:`id -g`" |   user="`id -u`:`id -g`" | ||||||
| fi | fi | ||||||
| 
 | docker_run_options="$docker_run_options --user $user" | ||||||
| 
 | docker_run_options="$docker_run_options -e LOG_LEVEL=$LOG_LEVEL" | ||||||
| # The container currently needs to run privileged to allow it to read from e.g. | # The containers currently need to run privileged to allow them to read from e.g. | ||||||
| # /home on SELinux secured systems such as Fedora. We might want to use udica | # /home on SELinux secured systems such as Fedora. We might want to use udica | ||||||
| # instead in the future. | # instead in the future. | ||||||
|  | docker_run_options="$docker_run_options --privileged=true" | ||||||
| 
 | 
 | ||||||
| docker run --privileged=true --rm -t \ | 
 | ||||||
|   \ | # Build aliases for the containerized ocrd processors | ||||||
|   --user $user \ | build_alias() { | ||||||
|   --mount type=bind,src="$(pwd)",target=/data \ |   local command=$1 | ||||||
|   \ |   local docker_image=$2 | ||||||
|   -e LOG_LEVEL=$LOG_LEVEL \ | 
 | ||||||
|   $DOCKER_IMAGE "$@" |   alias $command="docker run $docker_run_options $docker_image $command" | ||||||
|  | } | ||||||
|  | shopt -s expand_aliases  # Required for non-interactive shells | ||||||
|  | build_alias ocrd                       boxed-base | ||||||
|  | build_alias ocrd-olena-binarize        boxed-ocrd_olena | ||||||
|  | build_alias ocrd-sbb-textline-detector boxed-sbb_textline_detector | ||||||
|  | build_alias ocrd-calamari-recognize    boxed-ocrd_calamari | ||||||
|  | build_alias ocrd-tesserocr-recognize   boxed-ocrd_tesserocr | ||||||
|  | build_alias ocrd-dinglehopper          boxed-dinglehopper | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | . $self_dir/my_ocrd_workflow | ||||||
|  |  | ||||||
|  | @ -1,4 +0,0 @@ | ||||||
| #!/bin/sh |  | ||||||
| # Run the my_ocrd_workflow container on the current workspace |  | ||||||
| 
 |  | ||||||
| DOCKER_IMAGE=mikegerber/my_ocrd_workflow:stable `dirname $0`/run "$@" |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue