mirror of
				https://github.com/qurator-spk/ocrd-galley.git
				synced 2025-10-30 02:34:13 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			174 lines
		
	
	
	
		
			4.3 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			174 lines
		
	
	
	
		
			4.3 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable file
		
	
	
	
	
| #!/bin/bash
 | |
| 
 | |
set -e  # Abort on error

# Configuration
# LOG_LEVEL keeps a value supplied by the caller and falls back to DEBUG when unset or empty.
# /etc/ocrd_logging.py uses this to set level for all OCR-D modules.
: "${LOG_LEVEL:=DEBUG}"
export LOG_LEVEL
TEXTEQUIV_LEVEL=glyph
export TEXTEQUIV_LEVEL
 | |
| 
 | |
# Command line parameters
#   -I, --input-file-grp GRP   sets INPUT_FILE_GRP (default: OCR-D-IMG)
#   --skip-validation          sets SKIP_VALIDATION=true (default: false)
#
# getopt(1) normalises the arguments into a canonical, quoted list that is re-installed as the
# positional parameters via "eval set --". -n makes getopt report errors under this script's name
# instead of "getopt".
OPTS=$(getopt -n "${0##*/}" -o I: --long input-file-grp:,skip-validation -- "$@")
eval set -- "$OPTS"
INPUT_FILE_GRP=OCR-D-IMG
SKIP_VALIDATION=false
while true; do
  case "$1" in
    -I | --input-file-grp)
      INPUT_FILE_GRP=$2
      shift 2
      ;;
    --skip-validation)
      SKIP_VALIDATION=true
      shift
      ;;
    --)
      # End of options; whatever follows stays in "$@".
      shift
      break
      ;;
    *)
      break
      ;;
  esac
done
 | |
| 
 | |
# Set up logging: trace every executed command when LOG_LEVEL is DEBUG or TRACE.
# [[ ... || ... ]] replaces the obsolescent "[ ... -o ... ]" form, which can misparse when
# LOG_LEVEL happens to look like a test operator (e.g. "(" or "!").
if [[ "$LOG_LEVEL" == "DEBUG" || "$LOG_LEVEL" == "TRACE" ]]; then
  set -x
fi
 | |
| 
 | |
| 
 | |
do_validate() {
  # Validate the workspace with "ocrd workspace validate".
  #
  # Globals:  SKIP_VALIDATION (read) - validation only runs when this is exactly "false"
  # Returns:  status of the validate call, or 0 when validation is skipped

  # Both ocrd_tesserocr + ocrd_calamari produce segment coordinates that are not strictly within their parent's
  # coordinates:
  #
  #     INCONSISTENCY in [...] coords [...] not within parent coords
  #
  # → --page-coordinate-consistency off
  #
  # ocrd_tesserocr sometimes produces segment text results that aren't concatenating as expected by the validator:
  #
  #     INCONSISTENCY in [...]: text results '[...]' != concatenated '[...]'
  #
  # → --page-strictness lax
  #
  # An array keeps each option a separate word without relying on unquoted word splitting
  # (and therefore on the current IFS); "local" keeps it out of the global namespace.
  local -a validate_options=(
    --skip dimension
    --skip pixel_density
    --page-strictness lax
    --page-coordinate-consistency off
  )
  if [[ "$SKIP_VALIDATION" == false ]]; then
    ocrd workspace validate "${validate_options[@]}"
  fi
}
 | |
| 
 | |
do_binarization() {
  # Binarize the images of the input file group with ocrd-olena-binarize.
  #
  # Globals:  INPUT_FILE_GRP (read) - passed as the input file group (-I)
  # Effects:  removes the file groups OCR-D-IMG-BINPAGE and OCR-D-IMG-BIN first, then passes
  #           both as output file groups (-O) to the binarization

  local ocrd_olena_binarize_parameters='{
    "impl": "sauvola-ms-split"
  }'
  ocrd workspace remove-group -rf OCR-D-IMG-BINPAGE
  ocrd workspace remove-group -rf OCR-D-IMG-BIN
  # Quoted so that an unusual group name is passed as exactly one argument.
  ocrd-olena-binarize -I "$INPUT_FILE_GRP" -O OCR-D-IMG-BINPAGE,OCR-D-IMG-BIN -p "$ocrd_olena_binarize_parameters"
}
 | |
| 
 | |
do_linesegmentation_tesserocr() {
  # Segment the lines in the binarized images: region segmentation first, then line
  # segmentation on its result, both with the ocrd-tesserocr processors.

  # Drop results of an earlier run before segmenting again.
  local stale_grp
  for stale_grp in OCR-D-SEG-REGION OCR-D-SEG-LINE; do
    ocrd workspace remove-group -rf "$stale_grp"
  done

  ocrd-tesserocr-segment-region \
    -I OCR-D-IMG-BINPAGE \
    -O OCR-D-SEG-REGION
  ocrd-tesserocr-segment-line \
    -I OCR-D-SEG-REGION \
    -O OCR-D-SEG-LINE
}
 | |
| 
 | |
do_linesegmentation_sbb() {
  # Segment the lines in the images with ocrd-sbb-textline-detector.
  #
  # Effects:  removes the file groups OCR-D-SEG-REGION and OCR-D-SEG-LINE first; the detector
  #           reads OCR-D-IMG-BINPAGE (-I) and is given OCR-D-SEG-LINE as output group (-O)

  # Local, so the parameter JSON does not leak into the global namespace.
  local ocrd_sbb_textline_detector_parameters='{
    "model": "/var/lib/textline_detection"
  }'
  ocrd workspace remove-group -rf OCR-D-SEG-REGION
  ocrd workspace remove-group -rf OCR-D-SEG-LINE
  ocrd-sbb-textline-detector -I OCR-D-IMG-BINPAGE -O OCR-D-SEG-LINE -p "$ocrd_sbb_textline_detector_parameters"
}
 | |
| 
 | |
do_ocr() {
  # Perform OCR on the segmented lines with ocrd-tesserocr-recognize.
  #
  # Globals:  TEXTEQUIV_LEVEL (read) - inserted verbatim as the "textequiv_level" parameter
  # Effects:  removes the file group OCR-D-OCR-TESS first; the recognizer reads OCR-D-SEG-LINE
  #           (-I) and is given OCR-D-OCR-TESS as output group (-O)

  # Local, so the parameter JSON does not leak into the global namespace.
  local ocrd_tesserocr_recognize_parameters='{
    "model": "GT4HistOCR_2000000",
    "textequiv_level": "'"$TEXTEQUIV_LEVEL"'"
  }'
  ocrd workspace remove-group -rf OCR-D-OCR-TESS
  ocrd-tesserocr-recognize -I OCR-D-SEG-LINE -O OCR-D-OCR-TESS -p "$ocrd_tesserocr_recognize_parameters"
}
 | |
| 
 | |
do_ocr_calamari() {
  # Perform OCR on the segmented lines with ocrd-calamari-recognize.
  #
  # Globals:  TEXTEQUIV_LEVEL (read) - inserted verbatim as the "textequiv_level" parameter
  # Effects:  removes the file group OCR-D-OCR-CALAMARI first; the recognizer reads
  #           OCR-D-SEG-LINE (-I) and is given OCR-D-OCR-CALAMARI as output group (-O)

  # Local, so the parameter JSON does not leak into the global namespace. The "*" in the
  # checkpoint path is passed on literally; it is never expanded by this shell.
  local ocrd_calamari_recognize_parameters='{
    "checkpoint": "/var/lib/calamari-models/GT4HistOCR/2019-07-22T15:49+0200/*.ckpt.json",
    "textequiv_level": "'"$TEXTEQUIV_LEVEL"'"
  }'
  ocrd workspace remove-group -rf OCR-D-OCR-CALAMARI
  ocrd-calamari-recognize -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI -p "$ocrd_calamari_recognize_parameters"
}
 | |
| 
 | |
page_validate_xml() {
  # Validate all PAGE XML files of a file group against the PAGE 2019-07-15 XML schema.
  #
  # Arguments: $1 - file group; its files are listed with "ocrd workspace find -G"
  # Returns:   status of the last xmllint call (0 if the group lists no files)
  #
  # The schema is read from the "xsd" directory next to this script if that directory exists,
  # otherwise from /usr/share/xml.

  local filegrp=$1
  local file xsd_dir
  local -a files=()

  # The schema location does not depend on the file, so resolve it once.
  xsd_dir="$(dirname -- "$0")/xsd"
  if [[ ! -d "$xsd_dir" ]]; then
    xsd_dir=/usr/share/xml
  fi

  # Read one path per line so that paths containing blanks or glob characters stay intact.
  # Collecting them first keeps the validation loop off the listing's stdin.
  while IFS= read -r file || [[ -n "$file" ]]; do
    if [[ -n "$file" ]]; then
      files+=("$file")
    fi
  done < <(ocrd workspace find -G "$filegrp")

  for file in "${files[@]}"; do
    xmllint --noout --schema "$xsd_dir/pagecontent.2019-07-15.xsd" "$file"
  done
}
 | |
| 
 | |
page_upgrade_to_2019() {
  # Upgrade PAGE files to 2019-07-15 by rewriting, in place, every "pagecontent/<digits and
  # dashes>" occurrence in the files of a file group to "pagecontent/2019-07-15".
  #
  # Arguments: $1 - file group; its files are listed with "ocrd workspace find -G"
  # Returns:   status of the last sed call (0 if the group lists no files)

  local filegrp=$1
  local file
  local -a files=()

  # Read one path per line so that paths containing blanks or glob characters stay intact.
  while IFS= read -r file || [[ -n "$file" ]]; do
    if [[ -n "$file" ]]; then
      files+=("$file")
    fi
  done < <(ocrd workspace find -G "$filegrp")

  for file in "${files[@]}"; do
    sed -i 's#pagecontent/[0-9-]*#pagecontent/2019-07-15#g' "$file"
  done
}
 | |
| 
 | |
| 
 | |
main() {
  # Run the whole workflow: binarization, line segmentation, then OCR with Calamari and
  # with Tesseract, validating after each stage. If the workspace lists a file group matching
  # OCR-D-GT-PAGE, each OCR result is additionally evaluated against it with ocrd-dinglehopper.

  local ocr_filegrp

  do_binarization
  do_validate

  do_linesegmentation_sbb
  page_upgrade_to_2019             OCR-D-SEG-LINE
  page_validate_xml                OCR-D-SEG-REGION
  page_validate_xml                OCR-D-SEG-LINE
  do_validate

  do_ocr_calamari

  do_ocr

  for ocr_filegrp in OCR-D-OCR-CALAMARI OCR-D-OCR-TESS; do

    page_validate_xml           "$ocr_filegrp"
    do_validate

    # Evaluation needs ground truth; skip it when the workspace has none.
    if ocrd workspace list-group | grep -q OCR-D-GT-PAGE; then
      ocrd workspace remove-group -rf "${ocr_filegrp}-EVAL"
      ocrd-dinglehopper -I "OCR-D-GT-PAGE,${ocr_filegrp}" -O "${ocr_filegrp}-EVAL"
    fi

  done
}
 | |
| 
 | |
| 
 | |
# At DEBUG/TRACE level, list the installed Python packages before starting the workflow.
# [[ ... || ... ]] replaces the obsolescent "[ ... -o ... ]" form, which can misparse when
# LOG_LEVEL happens to look like a test operator (e.g. "(" or "!").
if [[ "$LOG_LEVEL" == "DEBUG" || "$LOG_LEVEL" == "TRACE" ]]; then
  pip3 list
fi
main
 | |
| 
 | |
| 
 | |
| # vim:tw=120:
 |