ocrd-galley/my_ocrd_workflow

#!/bin/bash
# Log level handed to the OCR-D processors (-l); may be overridden from the
# environment, defaults to DEBUG.
LOG_LEVEL=${LOG_LEVEL:-DEBUG}

set -e  # Abort on error

# Trace every executed command at the two most verbose log levels.
# A case match is used instead of "[ ... -o ... ]": the -o operator is
# deprecated and is parsed ambiguously when the value looks like an operator.
case "$LOG_LEVEL" in
  DEBUG|TRACE)
    set -x
    ;;
esac


remove_filegrp() {
  # Remove the given file group from the workspace
  #
  # Arguments:
  #   $1 - value of the USE attribute of the mets:fileGrp element(s) to delete
  #   $2 - path of the METS file, which is edited in place
  #
  # Only the entries in the METS file are deleted (see the XXX notes below).

  local filegrp_use=$1
  local mets=$2

  # "$mets" is quoted so that a path containing whitespace stays one argument.
  xmlstarlet ed --inplace \
    -N mets=http://www.loc.gov/METS/ \
    -d "//mets:fileGrp[@USE='$filegrp_use']" "$mets"

  # XXX See also https://github.com/OCR-D/core/issues/245
  # XXX This should also delete the files (after checking if they are indeed inside the workspace) and the directory
}

do_validate() {
  # Validate the workspace described by mets.xml in the current directory,
  # then remove a TEMP directory if one is lying around.

  # XXX ocrd-tesserocr INCONSISTENCY in TextRegion → use "--page-strictness lax" for now
  ocrd workspace validate \
    --skip pixel_density \
    --page-strictness lax \
    mets.xml

  # XXX
  # Nothing more to do unless a TEMP directory exists.
  [[ -d TEMP ]] || return 0
  printf '%s\n' "TEMP exists!"
  rm -rf TEMP
}

do_binarization() {
  # Binarize the images
  #
  # Drops any existing OCR-D-IMG-BIN entry from mets.xml, then runs
  # ocrd-olena-binarize with -I OCR-D-IMG and -O OCR-D-IMG-BIN.
  # Globals: LOG_LEVEL (read)

  remove_filegrp OCR-D-IMG-BIN mets.xml
  # "$LOG_LEVEL" is quoted so that an empty value cannot make -l swallow -m.
  # The JSON parameters are handed over as a file via process substitution.
  ocrd-olena-binarize -l "$LOG_LEVEL" \
    -m mets.xml -I OCR-D-IMG -O OCR-D-IMG-BIN \
    -p <(printf '%s\n' '{"impl": "sauvola-ms-split"}')
}

do_fontident() {
  # Identify fonts in the images
  #
  # Drops any existing OCR-D-OCR-FONTIDENT entry from mets.xml, then runs
  # ocrd-typegroups-classifier with -I OCR-D-IMG and -O OCR-D-OCR-FONTIDENT.
  # Globals: LOG_LEVEL (read)
  # Returns non-zero without running the classifier if the model path
  # cannot be determined.

  # Ask the installed ocrd_typegroups_classifier package where its model lives.
  local network
  network=$(python3 -c "import ocrd_typegroups_classifier, os; print(os.path.join(os.path.dirname(ocrd_typegroups_classifier.__file__), 'models', 'classifier.tgc'))") || return

  # Build the JSON in one piece; it is later printed quoted so the path is
  # neither word-split nor glob-expanded.
  local ocrd_typegroups_classifier_parameters
  printf -v ocrd_typegroups_classifier_parameters \
    '{ "network": "%s", "stride": 143 }' "$network"

  remove_filegrp OCR-D-OCR-FONTIDENT mets.xml
  ocrd-typegroups-classifier -l "$LOG_LEVEL" \
    -m mets.xml -I OCR-D-IMG -O OCR-D-OCR-FONTIDENT \
    -p <(printf '%s\n' "$ocrd_typegroups_classifier_parameters")
  # XXX Check if ocrd-typegroups-classifier uses the whole image
  # XXX does DEFAULT have any meaning? /buerger_gedichte_1778.ocrd does not have
  #     any DEFAULT, yet -I DEFAULT seems to work for ocrd-typegroups-classifier
}

do_linesegmentation_tesserocr() {
  # Segment the lines in the binarized images
  #
  # Drops the OCR-D-SEG-REGION and OCR-D-SEG-LINE entries from mets.xml, then
  # runs ocrd-tesserocr-segment-region (OCR-D-IMG-BIN → OCR-D-SEG-REGION)
  # followed by ocrd-tesserocr-segment-line (OCR-D-SEG-REGION → OCR-D-SEG-LINE).
  # Globals: LOG_LEVEL (read)

  remove_filegrp OCR-D-SEG-REGION mets.xml
  remove_filegrp OCR-D-SEG-LINE mets.xml
  #ocrd-ocropy-segment -l $LOG_LEVEL \
  #  -m mets.xml -I OCR-D-IMG-BIN -O OCR-D-SEG-LINE
  # XXX ocrd-ocropy-segment throws an exception for buerger_gedichte_1778.ocrd

  # "$LOG_LEVEL" is quoted so that an empty value cannot make -l swallow -m.
  ocrd-tesserocr-segment-region -l "$LOG_LEVEL" \
    -m mets.xml -I OCR-D-IMG-BIN -O OCR-D-SEG-REGION
  ocrd-tesserocr-segment-line -l "$LOG_LEVEL" \
    -m mets.xml -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE

  # XXX compare ocrd-tesserocr-segment* vs tesseract native
}

do_linesegmentation_sbb() {
  # Segment the lines in the images
  #
  # Drops the OCR-D-SEG-REGION and OCR-D-SEG-LINE entries from mets.xml, then
  # runs ocrd_sbb_textline_detector with -I OCR-D-IMG and -O OCR-D-SEG-LINE.
  # Globals: LOG_LEVEL (read)

  remove_filegrp OCR-D-SEG-REGION mets.xml
  remove_filegrp OCR-D-SEG-LINE mets.xml
  # "$LOG_LEVEL" is quoted so that an empty value cannot make -l swallow -m.
  ocrd_sbb_textline_detector -l "$LOG_LEVEL" \
    -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-LINE \
    -p '{"model": "/var/lib/textline_detection"}'
}

do_ocr() {
  # Perform OCR on the segmented lines
  #
  # Drops any existing OCR-D-OCR-TESS entry from mets.xml, then runs
  # ocrd-tesserocr-recognize with -I OCR-D-SEG-LINE and -O OCR-D-OCR-TESS.
  # Globals: LOG_LEVEL (read)

  local ocrd_tesserocr_recognize_parameters='{ "model": "GT4HistOCR_2000000" }'  # TODO mods:language + fontident → model
  remove_filegrp OCR-D-OCR-TESS mets.xml
  # The JSON is printed quoted so it reaches the processor unmodified
  # (no word splitting, no pathname expansion).
  ocrd-tesserocr-recognize -l "$LOG_LEVEL" \
    -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-TESS \
    -p <(printf '%s\n' "$ocrd_tesserocr_recognize_parameters")
}

do_ocr_calamari() {
  # Perform OCR on the segmented lines using ocrd-calamari-recognize
  #
  # Drops any existing OCR-D-OCR-CALAMARI entry from mets.xml, then runs the
  # processor with -I OCR-D-SEG-LINE and -O OCR-D-OCR-CALAMARI.
  # Globals: LOG_LEVEL (read)

  local ocrd_calamari_recognize_parameters='{ "checkpoint": "/var/lib/calamari-models/GT4HistOCR/*.ckpt.json" }'
  remove_filegrp OCR-D-OCR-CALAMARI mets.xml
  # The checkpoint value contains a "*": print the JSON quoted so the shell
  # never attempts pathname expansion on it and the pattern is passed through
  # literally.
  ocrd-calamari-recognize -l "$LOG_LEVEL" \
    -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI \
    -p <(printf '%s\n' "$ocrd_calamari_recognize_parameters")
}

page_validate_xml() {
  # Validate all PAGE XML against the XML schema
  #
  # Arguments:
  #   $1 - file group whose files (as listed by "ocrd workspace find -G")
  #        are checked with xmllint
  # Does nothing and returns 0 if the listing is empty.

  local filegrp=$1

  # Use the xsd directory next to this script if there is one, otherwise the
  # system-wide location. This does not depend on the file being validated,
  # so it is resolved once before the loop.
  local xsd_dir
  xsd_dir=$(dirname "$0")/xsd
  if [[ ! -d "$xsd_dir" ]]; then
    xsd_dir=/usr/share/xml
  fi

  # Read the listing line by line so that a path containing whitespace is
  # kept as a single file name.
  local file
  while IFS= read -r file || [[ -n "$file" ]]; do
    [[ -n "$file" ]] || continue
    xmllint --noout --schema "$xsd_dir/pagecontent.2019-07-15.xsd" "$file"
  done < <(ocrd workspace find -G "$filegrp")
}

page_fix_image_references_to_bin() {
  # Make image references point to the binarized images
  # XXX This is a hack, it is probably better to use alternative images in ocrd_calamari
  #
  # Arguments:
  #   $1 - file group whose files are rewritten in place
  #
  # The n-th file of OCR-D-IMG is paired with the n-th image/png file of
  # OCR-D-IMG-BIN. Returns non-zero, without touching any file, if there are
  # fewer binarized images than original images.

  local filegrp=$1
  local -a imgs=() imgs_bin=() sed_args=()
  local entry i search replacement file

  # Arrays with filenames to the images. They are identical for every file
  # processed below, so the workspace is queried only once.
  while IFS= read -r entry || [[ -n "$entry" ]]; do
    if [[ -n "$entry" ]]; then
      imgs+=("$entry")
    fi
  done < <(ocrd workspace find -G OCR-D-IMG)
  while IFS= read -r entry || [[ -n "$entry" ]]; do
    if [[ -n "$entry" ]]; then
      imgs_bin+=("$entry")
    fi
  done < <(ocrd workspace find -G OCR-D-IMG-BIN -m image/png)

  # No source images means there is nothing to rewrite.
  if (( ${#imgs[@]} == 0 )); then
    return 0
  fi

  # Without this check a missing counterpart would be written as imageFilename="".
  if (( ${#imgs_bin[@]} < ${#imgs[@]} )); then
    printf 'page_fix_image_references_to_bin: %d file(s) in OCR-D-IMG but only %d PNG(s) in OCR-D-IMG-BIN\n' \
      "${#imgs[@]}" "${#imgs_bin[@]}" >&2
    return 1
  fi

  # Build one sed expression per image pair. File names are escaped so that
  # regex metacharacters (e.g. the "." before the extension), "&", "\" and
  # the "!" delimiter are all taken literally.
  for i in "${!imgs[@]}"; do
    search=$(printf '%s\n' "${imgs[$i]}" | sed 's/[][\.*^$!]/\\&/g')
    replacement=$(printf '%s\n' "${imgs_bin[$i]}" | sed 's/[\&!]/\\&/g')
    # The "." on either side of the file name matches the attribute's quote character.
    sed_args+=(-e "s!imageFilename=.${search}.!imageFilename=\"${replacement}\"!g")
  done

  # Change all image references to point to the corresponding binarized image
  # (a single sed run per file applies every substitution).
  while IFS= read -r file || [[ -n "$file" ]]; do
    [[ -n "$file" ]] || continue
    sed -i "${sed_args[@]}" "$file"
  done < <(ocrd workspace find -G "$filegrp")
}

page_downgrade_to_2018() {
  # Not used anymore, but kept if needed in the future
  #
  # Replaces every "pagecontent/<digits and dashes>" in the files of the
  # given file group with "pagecontent/2018-07-15", editing them in place.
  #
  # Arguments:
  #   $1 - file group to process (as listed by "ocrd workspace find -G")

  local filegrp=$1

  # Read the listing line by line so that a path containing whitespace is
  # kept as a single file name.
  local file
  while IFS= read -r file || [[ -n "$file" ]]; do
    [[ -n "$file" ]] || continue
    sed -i 's#pagecontent/[0-9-]*#pagecontent/2018-07-15#g' "$file"
  done < <(ocrd workspace find -G "$filegrp")
}

page_upgrade_to_2019() {
  # Replaces every "pagecontent/<digits and dashes>" in the files of the
  # given file group with "pagecontent/2019-07-15", editing them in place.
  #
  # Arguments:
  #   $1 - file group to process (as listed by "ocrd workspace find -G")

  local filegrp=$1

  # Read the listing line by line so that a path containing whitespace is
  # kept as a single file name.
  local file
  while IFS= read -r file || [[ -n "$file" ]]; do
    [[ -n "$file" ]] || continue
    sed -i 's#pagecontent/[0-9-]*#pagecontent/2019-07-15#g' "$file"
  done < <(ocrd workspace find -G "$filegrp")
}

# List the installed Python packages so the run is easier to check afterwards
pip3 list


#do_fontident
#do_validate


# Binarization
do_binarization
do_validate


# Line segmentation; the resulting PAGE files get their image references
# pointed at the binarized images and their PAGE version set to 2019
do_linesegmentation_sbb
page_fix_image_references_to_bin "OCR-D-SEG-LINE"
page_upgrade_to_2019 "OCR-D-SEG-LINE"
page_validate_xml "OCR-D-SEG-REGION"
page_validate_xml "OCR-D-SEG-LINE"
do_validate


# OCR with Calamari
do_ocr_calamari


# OCR with Tesseract
do_ocr


# Validate, and if possible evaluate, the output of both OCR engines
ocr_filegrps=(OCR-D-OCR-CALAMARI OCR-D-OCR-TESS)
for ocr_filegrp in "${ocr_filegrps[@]}"; do
  eval_filegrp="${ocr_filegrp}-EVAL"

  page_validate_xml "$ocr_filegrp"
  do_validate

  # NOTE(review): this repeats the two checks above unchanged; presumably a
  # leftover from a step that used to sit in between - confirm before removing
  page_validate_xml "$ocr_filegrp"
  do_validate

  # XXX This seems to be causing new problems with validation
  # https://github.com/OCR-D/core/issues/176
  #page_fix_image_references   $ocr_filegrp
  #do_validate

  # Evaluation is only run when the workspace lists an OCR-D-GT-PAGE group
  ocrd workspace list-group | grep -q OCR-D-GT-PAGE || continue
  remove_filegrp "$eval_filegrp" mets.xml
  ocrd-dinglehopper -m mets.xml -I "OCR-D-GT-PAGE,${ocr_filegrp}" -O "$eval_filegrp"

done

# vim:tw=120:
Initial commit 2019-06-19 12:22:41 +02:00			`#!/bin/bash`
🔧 Allow setting LOG_LEVEL 2019-09-27 12:09:37 +02:00			`LOG_LEVEL=${LOG_LEVEL:-DEBUG}`
Initial commit 2019-06-19 12:22:41 +02:00
			`set -e # Abort on error`
Add a global LOG_LEVEL option 2019-06-19 17:48:38 +02:00			`if [ "$LOG_LEVEL" = "DEBUG" -o "$LOG_LEVEL" = "TRACE" ]; then`
			`set -x`
			`fi`
Initial commit 2019-06-19 12:22:41 +02:00

			`remove_filegrp() {`
📝 Document the functions 2019-07-03 12:22:55 +02:00			`# Remove the given file group from the workspace`

Initial commit 2019-06-19 12:22:41 +02:00			`filegrp_use=$1`
			`mets=$2`

			`xmlstarlet ed --inplace \`
			`-N mets=http://www.loc.gov/METS/ \`
			`-d "//mets:fileGrp[@USE='$filegrp_use']" $mets`
XXX remove_filegrp link to OCR-D issue 2019-06-21 12:10:19 +02:00
			`# XXX See also https://github.com/OCR-D/core/issues/245`
			`# XXX This should also delete the files (after checking if they are indeed inside the workspace) and the directory`
Initial commit 2019-06-19 12:22:41 +02:00			`}`

✨ Validate workspace after each step 2019-08-05 15:27:38 +02:00			`do_validate() {`
🧹 Validate imagefilename again 2019-10-30 11:25:34 +01:00			`ocrd workspace validate --skip pixel_density --page-strictness lax mets.xml`
✨ Validate workspace after each step 2019-08-05 15:27:38 +02:00			`# XXX ocrd-tesserocr INCONSISTENCY in TextRegion → use "--page-strictness lax" for now`
💩 Remove mysterious TEMP directory for now 2019-09-26 16:55:54 +02:00
			`# XXX`
			`if test -d TEMP; then`
			`echo "TEMP exists!"`
			`rm -rf TEMP`
			`fi`
✨ Validate workspace after each step 2019-08-05 15:27:38 +02:00			`}`

Binarize images before segmenting 2019-06-24 12:34:08 +02:00			`do_binarization() {`
📝 Document the functions 2019-07-03 12:22:55 +02:00			`# Binarize the images`

Binarize images before segmenting 2019-06-24 12:34:08 +02:00			`remove_filegrp OCR-D-IMG-BIN mets.xml`
✨ Use ocrd_olena for binarization 2019-10-21 17:04:06 +02:00			`ocrd-olena-binarize -l $LOG_LEVEL \`
			`-m mets.xml -I OCR-D-IMG -O OCR-D-IMG-BIN \`
			`-p <(echo '{"impl": "sauvola-ms-split"}')`
Binarize images before segmenting 2019-06-24 12:34:08 +02:00			`}`

Refactor: Extract functions for the steps 2019-06-19 12:51:52 +02:00			`do_fontident() {`
📝 Document the functions 2019-07-03 12:22:55 +02:00			`# Identify fonts in the images`

Do not hardcode path to typegroups model binary 2019-06-24 17:31:25 +02:00			network=`python3 -c "import ocrd_typegroups_classifier, os; print(os.path.join(os.path.dirname(ocrd_typegroups_classifier.__file__), 'models', 'classifier.tgc'))"`
			`ocrd_typegroups_classifier_parameters="`
Refactor: Extract functions for the steps 2019-06-19 12:51:52 +02:00			`{`
Do not hardcode path to typegroups model binary 2019-06-24 17:31:25 +02:00			`\"network\": \"$network\",`
			`\"stride\": 143`
			`}"`
📝 Document the functions 2019-07-03 12:22:55 +02:00
🐛 Use a valid filegrp USE for fontident 2019-08-05 17:38:24 +02:00			`remove_filegrp OCR-D-OCR-FONTIDENT mets.xml`
Add a global LOG_LEVEL option 2019-06-19 17:48:38 +02:00			`ocrd-typegroups-classifier -l $LOG_LEVEL \`
🐛 Use a valid filegrp USE for fontident 2019-08-05 17:38:24 +02:00			`-m mets.xml -I OCR-D-IMG -O OCR-D-OCR-FONTIDENT \`
Refactor: Extract functions for the steps 2019-06-19 12:51:52 +02:00			`-p <(echo $ocrd_typegroups_classifier_parameters)`
Add a global LOG_LEVEL option 2019-06-19 17:48:38 +02:00			`# XXX Check if ocrd-typegroups-classifier uses the whole image`
Refactor: Extract functions for the steps 2019-06-19 12:51:52 +02:00			`# XXX does DEFAULT have any meaning? /buerger_gedichte_1778.ocrd does not have`
			`# any DEFAULT, yet -I DEFAULT seems to work for ocrd-typegroups-classifier`
			`}`

✨ Use sbb_textline_detector to segment lines 2019-10-11 19:16:43 +02:00			`do_linesegmentation_tesserocr() {`
📝 Document the functions 2019-07-03 12:22:55 +02:00			`# Segment the lines in the binarized images`

Use ocrd-tesserocr-segment-* 2019-06-24 12:13:49 +02:00			`remove_filegrp OCR-D-SEG-REGION mets.xml`
Refactor: Extract functions for the steps 2019-06-19 12:51:52 +02:00			`remove_filegrp OCR-D-SEG-LINE mets.xml`
Use ocrd-tesserocr-segment-* 2019-06-24 12:13:49 +02:00			`#ocrd-ocropy-segment -l $LOG_LEVEL \`
Binarize images before segmenting 2019-06-24 12:34:08 +02:00			`# -m mets.xml -I OCR-D-IMG-BIN -O OCR-D-SEG-LINE`
Use ocrd-tesserocr-segment-* 2019-06-24 12:13:49 +02:00			`# XXX ocrd-ocropy-segment throws an exception for buerger_gedichte_1778.ocrd`

			`ocrd-tesserocr-segment-region -l $LOG_LEVEL \`
Binarize images before segmenting 2019-06-24 12:34:08 +02:00			`-m mets.xml -I OCR-D-IMG-BIN -O OCR-D-SEG-REGION`
Use ocrd-tesserocr-segment-* 2019-06-24 12:13:49 +02:00			`ocrd-tesserocr-segment-line -l $LOG_LEVEL \`
			`-m mets.xml -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE`
✨ Validate workspace after each step 2019-08-05 15:27:38 +02:00
Use ocrd-tesserocr-segment-* 2019-06-24 12:13:49 +02:00			`# XXX compare ocrd-tesserocr-segment* vs tesseract native`
Refactor: Extract functions for the steps 2019-06-19 12:51:52 +02:00			`}`

✨ Use sbb_textline_detector to segment lines 2019-10-11 19:16:43 +02:00			`do_linesegmentation_sbb() {`
			`# Segment the lines in the images`

			`remove_filegrp OCR-D-SEG-REGION mets.xml`
			`remove_filegrp OCR-D-SEG-LINE mets.xml`
			`ocrd_sbb_textline_detector -l $LOG_LEVEL \`
			`-m mets.xml -I OCR-D-IMG -O OCR-D-SEG-LINE \`
			`-p '{"model": "/var/lib/textline_detection"}'`
			`}`

Add OCR step 2019-06-19 13:02:54 +02:00			`do_ocr() {`
📝 Document the functions 2019-07-03 12:22:55 +02:00			`# Perform OCR on the segmented lines`

✨ Use GT4HistOCR_2000000 model from qurator-data for Tesseract 2019-10-02 16:48:28 +02:00			`ocrd_tesserocr_recognize_parameters='{ "model": "GT4HistOCR_2000000" }' # TODO mods:language + fontident → model`
Add OCR step 2019-06-19 13:02:54 +02:00			`remove_filegrp OCR-D-OCR-TESS mets.xml`
Add a global LOG_LEVEL option 2019-06-19 17:48:38 +02:00			`ocrd-tesserocr-recognize -l $LOG_LEVEL \`
Add OCR step 2019-06-19 13:02:54 +02:00			`-m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-TESS \`
			`-p <(echo $ocrd_tesserocr_recognize_parameters)`
			`}`
Refactor: Extract functions for the steps 2019-06-19 12:51:52 +02:00
✨ Run Calamari OCR 2019-08-21 11:54:01 +02:00			`do_ocr_calamari() {`
			`ocrd_calamari_recognize_parameters='{ "checkpoint": "/var/lib/calamari-models/GT4HistOCR/*.ckpt.json" }'`
			`remove_filegrp OCR-D-OCR-CALAMARI mets.xml`
			`ocrd-calamari-recognize -l $LOG_LEVEL \`
			`-m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-CALAMARI \`
			`-p <(echo $ocrd_calamari_recognize_parameters)`
			`}`

Add a PAGE validation step 2019-06-19 14:55:50 +02:00			`page_validate_xml() {`
📝 Document the functions 2019-07-03 12:22:55 +02:00			`# Validate all PAGE XML against the XML schema`

Add a PAGE validation step 2019-06-19 14:55:50 +02:00			`filegrp=$1`

			`local file`
			for file in `ocrd workspace find -G $filegrp`; do
🎨 Move XML schemata to a better path 2019-09-30 18:25:54 +02:00			XSD_DIR=`dirname $0`/xsd
			`if [ ! -d "$XSD_DIR" ]; then`
			`XSD_DIR=/usr/share/xml`
			`fi`
			`xmllint --noout --schema $XSD_DIR/pagecontent.2019-07-15.xsd $file`
Add a PAGE validation step 2019-06-19 14:55:50 +02:00			`done`
			`}`

✨ Use sbb_textline_detector to segment lines 2019-10-11 19:16:43 +02:00			`page_fix_image_references_to_bin() {`
			`# Make image references point to the binarized images`
			`# XXX This is a hack, it is probably better to use alternative images in ocrd_calamari`
XXX Global -l DEBUG 2019-06-19 13:26:28 +02:00
🚧 Work around problems with ocrd-tesserocr producing TextEquiv/@conf 2019-08-05 15:40:39 +02:00			`filegrp=$1`

			`local file`
			for file in `ocrd workspace find -G $filegrp`; do
✨ Use sbb_textline_detector to segment lines 2019-10-11 19:16:43 +02:00			`# Arrays with filenames to the images`
			imgs=(`ocrd workspace find -G OCR-D-IMG`)
✨ Use ocrd_olena for binarization 2019-10-21 17:04:06 +02:00			imgs_bin=(`ocrd workspace find -G OCR-D-IMG-BIN -m image/png`)
✨ Use sbb_textline_detector to segment lines 2019-10-11 19:16:43 +02:00
			`# Change all image references to point to the corresponding binarized image`
			`for i in ${!imgs[@]}; do`
			`sed -i "s!imageFilename=.${imgs[$i]}.!imageFilename=\"${imgs_bin[$i]}\"!g" $file`
			`done`
🚧 Work around problems with ocrd-tesserocr producing TextEquiv/@conf 2019-08-05 15:40:39 +02:00			`done`
			`}`

✨ As a last step, downgrade to PAGE 2018 to support PAGE Viewer 2019-08-05 18:46:36 +02:00			`page_downgrade_to_2018() {`
⬆ Do not downgrade to PAGE 2018 anymore 2019-09-27 13:02:46 +02:00			`# Not used anymore, but kept if needed in the future`
✨ As a last step, downgrade to PAGE 2018 to support PAGE Viewer 2019-08-05 18:46:36 +02:00			`filegrp=$1`

			`local file`
			for file in `ocrd workspace find -G $filegrp`; do
			`sed -i 's#pagecontent/[0-9-]*#pagecontent/2018-07-15#g' $file`
			`done`
			`}`

✨ Use sbb_textline_detector to segment lines 2019-10-11 19:16:43 +02:00			`page_upgrade_to_2019() {`
			`filegrp=$1`

			`local file`
			for file in `ocrd workspace find -G $filegrp`; do
			`sed -i 's#pagecontent/[0-9-]*#pagecontent/2019-07-15#g' $file`
			`done`
			`}`
🚧 Work around problems with ocrd-tesserocr producing TextEquiv/@conf 2019-08-05 15:40:39 +02:00
✨ Run pip3 list for easier checking 2019-09-27 13:16:14 +02:00			`pip3 list`

🎨 Add extra newlines to separate steps 2019-09-30 12:26:14 +02:00
🚑 Don't install typegroups classifier for now 2019-08-16 18:23:15 +02:00			`#do_fontident`
			`#do_validate`
Binarize images before segmenting 2019-06-24 12:34:08 +02:00
🎨 Add extra newlines to separate steps 2019-09-30 12:26:14 +02:00
Binarize images before segmenting 2019-06-24 12:34:08 +02:00			`do_binarization`
✨ Validate workspace after each step 2019-08-05 15:27:38 +02:00			`do_validate`

🎨 Add extra newlines to separate steps 2019-09-30 12:26:14 +02:00
✨ Use sbb_textline_detector to segment lines 2019-10-11 19:16:43 +02:00			`do_linesegmentation_sbb`
			`page_fix_image_references_to_bin OCR-D-SEG-LINE`
			`page_upgrade_to_2019 OCR-D-SEG-LINE`
			`page_validate_xml OCR-D-SEG-REGION`
			`page_validate_xml OCR-D-SEG-LINE`
✨ Validate workspace after each step 2019-08-05 15:27:38 +02:00			`do_validate`

🔧 Evaluate both Tesseract and Calamari results 2019-08-21 13:07:27 +02:00
✨ Run Calamari OCR 2019-08-21 11:54:01 +02:00			`do_ocr_calamari`

🔧 Evaluate both Tesseract and Calamari results 2019-08-21 13:07:27 +02:00
Add OCR step 2019-06-19 13:02:54 +02:00			`do_ocr`
✨ Validate workspace after each step 2019-08-05 15:27:38 +02:00
Initial commit 2019-06-19 12:22:41 +02:00
🔧 Evaluate both Tesseract and Calamari results 2019-08-21 13:07:27 +02:00			`for ocr_filegrp in OCR-D-OCR-CALAMARI OCR-D-OCR-TESS; do`
✨ As a last step, downgrade to PAGE 2018 to support PAGE Viewer 2019-08-05 18:46:36 +02:00
🔧 Evaluate both Tesseract and Calamari results 2019-08-21 13:07:27 +02:00			`page_validate_xml $ocr_filegrp`
			`do_validate`
🚧 Use ocr-eval aka dinglehopper 2019-08-13 18:13:49 +02:00
🔧 Evaluate both Tesseract and Calamari results 2019-08-21 13:07:27 +02:00			`page_validate_xml $ocr_filegrp`
			`do_validate`

💩 Do not fix PAGE image references for now 2019-09-26 16:46:12 +02:00			`# XXX This seems to be causing new problems with validation`
			`# https://github.com/OCR-D/core/issues/176`
			`#page_fix_image_references $ocr_filegrp`
			`#do_validate`

🔧 Evaluate both Tesseract and Calamari results 2019-08-21 13:07:27 +02:00			`if ocrd workspace list-group \| grep -q OCR-D-GT-PAGE; then`
			`remove_filegrp $ocr_filegrp-EVAL mets.xml`
			`ocrd-dinglehopper -m mets.xml -I OCR-D-GT-PAGE,$ocr_filegrp -O $ocr_filegrp-EVAL`
			`fi`

			`done`
🚧 Use ocr-eval aka dinglehopper 2019-08-13 18:13:49 +02:00
Reformat to use shorter lines 2019-06-19 12:39:42 +02:00			`# vim:tw=120:`