mirror of
				https://github.com/qurator-spk/ocrd-galley.git
				synced 2025-10-25 09:24:13 +02:00 
			
		
		
		
	🚧 Prepare supporting ocrd-sbb-binarize
ocrd-sbb-binarize seems to work, but its input does not work with ocrd-sbb-textline-detector; see https://github.com/qurator-spk/sbb_binarization/issues/8 and https://github.com/qurator-spk/sbb_textline_detection/issues/47
This commit is contained in:
		
							parent
							
								
									053fc0bc34
								
							
						
					
					
						commit
						0841af5491
					
				
					 6 changed files with 57 additions and 6 deletions
				
			
		
							
								
								
									
										22
									
								
								Dockerfile-sbb_binarization
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								Dockerfile-sbb_binarization
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,22 @@ | |||
| FROM my_ocrd_workflow-core | ||||
| 
 | ||||
| ARG PIP_INSTALL="pip3 install --no-cache-dir --use-feature=2020-resolver" | ||||
| ARG SBB_BINARIZATION_COMMIT="3e60a62" | ||||
| 
 | ||||
| 
 | ||||
| # Build pip installable stuff | ||||
| RUN ${PIP_INSTALL} \ | ||||
| # Now the real stuff: | ||||
|     https://github.com/qurator-spk/sbb_binarization/archive/$SBB_BINARIZATION_COMMIT.tar.gz | ||||
| 
 | ||||
| 
 | ||||
| # Copy models | ||||
| COPY data/sbb_binarization /var/lib/sbb_binarization | ||||
| 
 | ||||
| 
 | ||||
| # Check pip dependencies | ||||
| RUN pip3 check | ||||
| 
 | ||||
| 
 | ||||
| # Default command | ||||
| CMD ["ocrd-sbb-binarize"] | ||||
							
								
								
									
										3
									
								
								build
									
										
									
									
									
								
							
							
						
						
									
										3
									
								
								build
									
										
									
									
									
								
							|  | @ -10,11 +10,13 @@ get_from_annex() { | |||
|   annex_get 'tesseract-models/GT4HistOCR/*.traineddata' | ||||
|   annex_get 'textline_detection/*.h5' | ||||
|   annex_get 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' | ||||
|   annex_get 'sbb_binarization/*.h5' | ||||
| } | ||||
| get_from_web() { | ||||
|   download_to 'https://qurator-data.de/calamari-models/GT4HistOCR/model.tar.xz' 'calamari-models/GT4HistOCR/2019-07-22T15_49+0200' | ||||
|   download_to 'https://qurator-data.de/tesseract-models/GT4HistOCR/models.tar'  'tesseract-models/GT4HistOCR' | ||||
|   download_to 'https://qurator-data.de/sbb_textline_detector/models.tar.gz'     'textline_detection' | ||||
|   download_to --strip-components 1 'https://qurator-data.de/sbb_binarization/models.tar.gz' 'sbb_binarization' | ||||
|   download_to --no-unpack 'https://qurator-data.de/mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' 'mirror/github.com/tesseract-ocr/tessdata_best/archive/4.0.0-repacked.tar.gz' | ||||
| } | ||||
| . $self_dir/qurator_data_lib.sh | ||||
|  | @ -27,3 +29,4 @@ docker build --cache-from=my_ocrd_workflow-dinglehopper          -t my_ocrd_work | |||
| docker build --cache-from=my_ocrd_workflow-ocrd_olena            -t my_ocrd_workflow-ocrd_olena            -f Dockerfile-ocrd_olena            . | ||||
| docker build --cache-from=my_ocrd_workflow-ocrd_tesserocr        -t my_ocrd_workflow-ocrd_tesserocr        -f Dockerfile-ocrd_tesserocr        . | ||||
| docker build --cache-from=my_ocrd_workflow-sbb_textline_detector -t my_ocrd_workflow-sbb_textline_detector -f Dockerfile-sbb_textline_detector . | ||||
| docker build --cache-from=my_ocrd_workflow-sbb_binarization      -t my_ocrd_workflow-sbb_binarization      -f Dockerfile-sbb_binarization      . | ||||
|  |  | |||
							
								
								
									
										2
									
								
								data
									
										
									
									
									
								
							
							
						
						
									
										2
									
								
								data
									
										
									
									
									
								
							|  | @ -1 +1 @@ | |||
| Subproject commit 0cc78464e74fd295f00ee28a6f605ce0856d3db0 | ||||
| Subproject commit bd1628e1aefbb0766655ee3726b4b0234d20a4f1 | ||||
|  | @ -58,6 +58,7 @@ main() { | |||
|   do_validate | ||||
| 
 | ||||
| 
 | ||||
|   #ocrd-sbb-binarize --overwrite -I $INPUT_FILE_GRP -O OCR-D-IMG-BIN -P model "/var/lib/sbb_binarization" | ||||
|   ocrd-olena-binarize --overwrite -I $INPUT_FILE_GRP -O OCR-D-IMG-BIN -P impl "sauvola-ms-split" | ||||
|   do_validate | ||||
| 
 | ||||
|  |  | |||
|  | @ -62,13 +62,37 @@ annex_get() { | |||
|   ) | ||||
| } | ||||
| 
 | ||||
| # Options: | ||||
| # --no-unpack                Do NOT unpack the file | ||||
| # --strip-components NUMBER  (as tar's option) | ||||
| download_to() { | ||||
|   unpack=1 | ||||
|   if [[ "$1" = '--no-unpack' ]]; then | ||||
|     unpack=0 | ||||
|     shift | ||||
|   tar_options="" | ||||
| 
 | ||||
|   _options=$(getopt --long no-unpack,strip-components: -- "" "$@") | ||||
|   if [[ $? != 0 ]]; then | ||||
|     echo "Bad parameters for download_to" >&2 | ||||
|     exit 1 | ||||
|   fi | ||||
|   file_pattern="$1" | ||||
|   eval set -- "$_options" | ||||
|   while true; do | ||||
|     case "$1" in | ||||
|     --no-unpack) | ||||
|       unpack=0 | ||||
|       ;; | ||||
|     --strip-components) | ||||
|       shift | ||||
|       components=$1 | ||||
|       tar_options="$tar_options --strip-components $components" | ||||
|       ;; | ||||
|     --) | ||||
|       shift | ||||
|       break | ||||
|       ;; | ||||
|     esac | ||||
|     shift | ||||
|   done | ||||
| 
 | ||||
|   download_source="$1" | ||||
|   dest="$2" | ||||
| 
 | ||||
|  | @ -79,7 +103,7 @@ download_to() { | |||
|     if [[ $unpack = 1 ]]; then | ||||
|       mkdir -p "$dest" | ||||
|       # Unpacking relies on tar -a unpacking any tar compression | ||||
|       tar -C "$dest" -af $tmpf -xv | ||||
|       tar -C "$dest" $tar_options -af $tmpf -xv | ||||
|       rm -f $tmpf | ||||
|     else | ||||
|       dest_dir=`dirname "$dest"` | ||||
|  |  | |||
							
								
								
									
										1
									
								
								run
									
										
									
									
									
								
							
							
						
						
									
										1
									
								
								run
									
										
									
									
									
								
							|  | @ -37,6 +37,7 @@ build_alias() { | |||
| shopt -s expand_aliases  # Required for non-interactive shells | ||||
| build_alias ocrd                       ${DOCKER_IMAGE_PREFIX}-core:${DOCKER_IMAGE_TAG} | ||||
| build_alias ocrd-olena-binarize        ${DOCKER_IMAGE_PREFIX}-ocrd_olena:${DOCKER_IMAGE_TAG} | ||||
| build_alias ocrd-sbb-binarize          ${DOCKER_IMAGE_PREFIX}-sbb_binarization:${DOCKER_IMAGE_TAG} | ||||
| build_alias ocrd-sbb-textline-detector ${DOCKER_IMAGE_PREFIX}-sbb_textline_detector:${DOCKER_IMAGE_TAG} | ||||
| build_alias ocrd-calamari-recognize    ${DOCKER_IMAGE_PREFIX}-ocrd_calamari:${DOCKER_IMAGE_TAG} | ||||
| build_alias ocrd-tesserocr-recognize   ${DOCKER_IMAGE_PREFIX}-ocrd_tesserocr:${DOCKER_IMAGE_TAG} | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue