mirror of
				https://github.com/qurator-spk/ocrd-galley.git
				synced 2025-10-30 02:34:13 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			138 lines
		
	
	
	
		
			3.5 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			138 lines
		
	
	
	
		
			3.5 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable file
		
	
	
	
	
#!/bin/bash
#
# Runs the OCR-D processing steps defined below.
#
# Environment:
#   LOG_LEVEL  Log level exported for the OCR-D modules (default: DEBUG).
#              DEBUG and TRACE additionally enable shell command tracing.

set -e  # Abort on error

# Set up logging
export LOG_LEVEL=${LOG_LEVEL:-DEBUG}  # /etc/ocrd_logging.py uses this to set level for all OCR-D modules
if [[ "$LOG_LEVEL" == "DEBUG" || "$LOG_LEVEL" == "TRACE" ]]; then
  set -x  # Trace every command at the verbose log levels
fi
 | |
| 
 | |
| 
 | |
# Validate the workspace with `ocrd workspace validate`, using relaxed checks.
#
# Arguments: none
# Returns:   exit status of `ocrd workspace validate`
do_validate() {
  # Kept in an array so that every option reaches ocrd as its own argument,
  # independent of the current IFS.
  local -a validate_options=(
    --skip dimension
    --skip pixel_density
    # XXX ocrd-tesserocr INCONSISTENCY in TextRegion → use "--page-strictness lax" for now
    --page-strictness lax
    # XXX INVALIDITY in Glyph ID etc. in GT → --page-coordinate-consistency off
    --page-coordinate-consistency off
  )
  ocrd workspace validate "${validate_options[@]}"
}
 | |
| 
 | |
# Binarize the images: run ocrd-olena-binarize from file group OCR-D-IMG
# into OCR-D-IMG-BINPAGE.
#
# Arguments: none
do_binarization() {
  # JSON parameters for the processor; local so they do not leak to callers
  local -r ocrd_olena_binarize_parameters='{"impl": "sauvola-ms-split"}'

  # Remove these file groups before running the processor
  ocrd workspace remove-group -rf OCR-D-IMG-BINPAGE
  ocrd workspace remove-group -rf OCR-D-IMG-BIN

  ocrd-olena-binarize \
    -I OCR-D-IMG -O OCR-D-IMG-BINPAGE \
    -p "$ocrd_olena_binarize_parameters"
}
 | |
| 
 | |
# Segment the lines in the binarized images using the ocrd-tesserocr
# processors: regions first (OCR-D-SEG-REGION), then lines (OCR-D-SEG-LINE).
#
# Arguments: none
do_linesegmentation_tesserocr() {
  local stale_group

  # Remove both output groups before the processors run
  for stale_group in OCR-D-SEG-REGION OCR-D-SEG-LINE; do
    ocrd workspace remove-group -rf "$stale_group"
  done

  ocrd-tesserocr-segment-region -I OCR-D-IMG-BINPAGE -O OCR-D-SEG-REGION
  ocrd-tesserocr-segment-line -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE

  # XXX compare ocrd-tesserocr-segment* vs tesseract native
}
 | |
| 
 | |
# Segment the lines in the images: run ocrd-sbb-textline-detector from
# OCR-D-IMG-BINPAGE into OCR-D-SEG-LINE.
#
# Arguments: none
do_linesegmentation_sbb() {
  # JSON parameters for the processor; local so they do not leak to callers
  local -r ocrd_sbb_textline_detector_parameters='{"model": "/var/lib/textline_detection"}'

  # Remove these file groups before running the processor.
  # Note: only OCR-D-SEG-LINE is written again by this function.
  ocrd workspace remove-group -rf OCR-D-SEG-REGION
  ocrd workspace remove-group -rf OCR-D-SEG-LINE

  ocrd-sbb-textline-detector \
    -I OCR-D-IMG-BINPAGE -O OCR-D-SEG-LINE \
    -p "$ocrd_sbb_textline_detector_parameters"
}
 | |
| 
 | |
# Perform OCR on the segmented lines: run ocrd-tesserocr-recognize from
# OCR-D-SEG-LINE into OCR-D-OCR-TESS.
#
# Arguments: none
do_ocr() {
  # JSON parameters for the processor; local so they do not leak to callers
  # TODO mods:language + fontident → model
  local -r ocrd_tesserocr_recognize_parameters='{ "model": "GT4HistOCR_2000000" }'

  # Remove the output group before running the processor
  ocrd workspace remove-group -rf OCR-D-OCR-TESS

  ocrd-tesserocr-recognize \
    -I OCR-D-SEG-LINE -O OCR-D-OCR-TESS \
    -p "$ocrd_tesserocr_recognize_parameters"
}
 | |
| 
 | |
# Perform OCR with ocrd-calamari-recognize from OCR-D-SEG-LINE into
# OCR-D-OCR-CALAMARI.
#
# Arguments: none
do_ocr_calamari() {
  # JSON parameters for the processor; local so they do not leak to callers.
  # The checkpoint glob is passed on literally (it is inside quotes).
  local -r ocrd_calamari_recognize_parameters='{
    "checkpoint": "/var/lib/calamari-models/GT4HistOCR/2019-07-22T15:49+0200/*.ckpt.json",
    "textequiv_level": "line"
  }'

  # Remove the output group before running the processor
  ocrd workspace remove-group -rf OCR-D-OCR-CALAMARI

  ocrd-calamari-recognize \
    -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI \
    -p "$ocrd_calamari_recognize_parameters"
}
 | |
| 
 | |
# Validate all PAGE XML of a file group against the XML schema.
#
# The schema pagecontent.2019-07-15.xsd is taken from an "xsd" directory next
# to this script, or from /usr/share/xml if that directory does not exist.
#
# Arguments: $1 - file group whose files are validated
# Returns:   xmllint's exit status is not handled here, so under `set -e` a
#            failing xmllint call aborts the script.
page_validate_xml() {
  local filegrp=$1
  local xsd_dir file
  local -a files

  # The schema location does not depend on the file, so resolve it once
  xsd_dir="$(dirname -- "$0")/xsd"
  if [[ ! -d "$xsd_dir" ]]; then
    xsd_dir=/usr/share/xml
  fi

  # `ocrd workspace find` is expected to print one path per line; reading the
  # lines into an array keeps paths with spaces or glob characters intact.
  mapfile -t files < <(ocrd workspace find -G "$filegrp")

  for file in "${files[@]}"; do
    [[ -n "$file" ]] || continue  # ignore blank lines
    xmllint --noout --schema "$xsd_dir/pagecontent.2019-07-15.xsd" "$file"
  done
}
 | |
| 
 | |
# Rewrite, in place, every "pagecontent/<digits and dashes>" reference in the
# files of a file group to "pagecontent/2019-07-15".
#
# Arguments: $1 - file group whose files are rewritten
page_upgrade_to_2019() {
  local filegrp=$1
  local file
  local -a files

  # `ocrd workspace find` is expected to print one path per line; reading the
  # lines into an array keeps paths with spaces or glob characters intact.
  mapfile -t files < <(ocrd workspace find -G "$filegrp")

  for file in "${files[@]}"; do
    [[ -n "$file" ]] || continue  # ignore blank lines
    sed -i 's#pagecontent/[0-9-]*#pagecontent/2019-07-15#g' -- "$file"
  done
}
 | |
| 
 | |
# --- Main workflow ---------------------------------------------------------

pip3 list  # List the installed Python packages


# Binarization
do_binarization
do_validate


# Line segmentation, then schema validation of the segmentation output
do_linesegmentation_sbb
page_upgrade_to_2019 OCR-D-SEG-LINE
page_validate_xml OCR-D-SEG-REGION
page_validate_xml OCR-D-SEG-LINE
do_validate


# OCR with both engines
do_ocr_calamari
do_ocr


for ocr_filegrp in OCR-D-OCR-CALAMARI OCR-D-OCR-TESS; do

  # Validate each OCR result once. Nothing modifies the workspace between
  # validations, so a second identical pass would only repeat the same checks.
  page_validate_xml "$ocr_filegrp"
  do_validate

  # Evaluate with dinglehopper only when the workspace lists OCR-D-GT-PAGE
  if ocrd workspace list-group | grep -q OCR-D-GT-PAGE; then
    ocrd workspace remove-group -rf "${ocr_filegrp}-EVAL"
    ocrd-dinglehopper -I "OCR-D-GT-PAGE,${ocr_filegrp}" -O "${ocr_filegrp}-EVAL"
  fi

done
 | |
| 
 | |
| # vim:tw=120:
 |