#!/bin/bash
#
# Runs a small OCR-D workflow on the workspace in the current directory
# (expects ./mets.xml): ocrd-typegroups-classifier, ocrd-ocropy-segment and
# ocrd-tesserocr-recognize, followed by fix-ups and schema validation of the
# PAGE XML files in the OCR-D-OCR-TESS file group.
#
# External tools used: xmlstarlet, ocrd, ocrd-typegroups-classifier,
# ocrd-ocropy-segment, ocrd-tesserocr-recognize, xmllint, sed, find.
#
# Environment:
#   OCRD_TYPEGROUPS_MODEL  Optional path to the typegroups classifier model
#                          (defaults to the original hard-coded location).

set -euo pipefail # Abort on error, on unset variables and on failed pipelines
set -x


#######################################
# Delete every mets:fileGrp with the given USE attribute from a METS file,
# editing the file in place.
# Arguments:
#   $1 - value of the USE attribute to match
#   $2 - path to the METS file
#######################################
remove_filegrp() {
  local filegrp_use=$1
  local mets=$2

  xmlstarlet ed --inplace \
    -N mets=http://www.loc.gov/METS/ \
    -d "//mets:fileGrp[@USE='$filegrp_use']" "$mets"
}

#######################################
# Run ocrd-typegroups-classifier with OCR-D-IMG as input and OCR-D-FONTIDENT
# as output file group, after removing a previous OCR-D-FONTIDENT group.
# Globals:
#   OCRD_TYPEGROUPS_MODEL (read, optional)
#######################################
do_fontident() {
  local network=${OCRD_TYPEGROUPS_MODEL:-/home/mike/devel/OCR-D/monorepo/ocrd_typegroups_classifier/ocrd_typegroups_classifier/models/classifier.tgc}
  local ocrd_typegroups_classifier_parameters
  # NOTE: the path is interpolated verbatim into JSON, so it must not contain
  # double quotes or backslashes.
  printf -v ocrd_typegroups_classifier_parameters \
    '{ "network": "%s", "stride":143 }' "$network"

  remove_filegrp OCR-D-FONTIDENT mets.xml
  ocrd-typegroups-classifier -l DEBUG \
    -m mets.xml -I OCR-D-IMG -O OCR-D-FONTIDENT \
    -p <(printf '%s\n' "$ocrd_typegroups_classifier_parameters")
  # XXX does DEFAULT have any meaning? /buerger_gedichte_1778.ocrd does not have
  #     any DEFAULT, yet -I DEFAULT seems to work for ocrd-typegroups-classifier

  #ocrd workspace validate mets.xml
  # XXX Unspecified USE category 'FONTIDENT' in fileGrp 'OCR-D-FONTIDENT'
  # XXX File 'OCR-D-FONTIDENT_OCR-D-IMG_0002' does not manifest any physical page.
  # XXX Won't download remote image
}

#######################################
# Run ocrd-ocropy-segment with OCR-D-IMG as input and OCR-D-SEG-LINE as output
# file group, after removing a previous OCR-D-SEG-LINE group, then delete the
# image copies it leaves in the workspace's top-level directory.
#######################################
do_linesegmentation() {
  remove_filegrp OCR-D-SEG-LINE mets.xml
  ocrd-ocropy-segment -l DEBUG \
    -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-LINE
  #ocrd workspace validate mets.xml

  # XXX This leaves copies of the images at the top level of the workspace, because it "downloads" the "remote" files.
  #     Clean it up.
  find . -maxdepth 1 -name "OCR-D-IMG*" -type f -exec rm -v -- {} +

  # XXX ocrd-tesserocr-segment-line does not seem to produce any line segmentation
  # XXX mv {ocrd-ocropy-segment,-line}
}

#######################################
# Run ocrd-tesserocr-recognize with OCR-D-SEG-LINE as input and OCR-D-OCR-TESS
# as output file group, after removing a previous OCR-D-OCR-TESS group.
#######################################
do_ocr() {
  local ocrd_tesserocr_recognize_parameters='{ "model": "eng" }' # TODO mods:language + fontident → model

  remove_filegrp OCR-D-OCR-TESS mets.xml
  ocrd-tesserocr-recognize -l DEBUG \
    -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-TESS \
    -p <(printf '%s\n' "$ocrd_tesserocr_recognize_parameters")
  #ocrd workspace validate mets.xml
}

#######################################
# Replace the 2017-07-15 PAGE namespace date with 2018-07-15 in every file of
# a file group, editing the files in place.
# Arguments:
#   $1 - file group to process
#######################################
page_fix_xml() {
  # XXX core does not produce valid XML https://github.com/OCR-D/core/issues/242
  local filegrp=$1
  local files file

  # Plain assignment (not `local x=$(...)`, not a `for` word list) so that a
  # failing `ocrd workspace find` aborts the script under `set -e`.
  # NOTE(review): assumes one path per output line -- confirm for the ocrd
  # version in use.
  files=$(ocrd workspace find -G "$filegrp")
  while IFS= read -r file; do
    [[ -n "$file" ]] || continue # here-string yields one empty line on no output
    sed -i 's#pagecontent/2017-07-15#pagecontent/2018-07-15#g' -- "$file"
  done <<<"$files"
}

#######################################
# Validate every file of a file group against the PAGE 2018-07-15 schema
# shipped in the xsd/ directory next to this script. A validation failure
# aborts the script (set -e).
# Arguments:
#   $1 - file group to process
#######################################
page_validate_xml() {
  local filegrp=$1
  local files file script_dir

  script_dir=$(dirname -- "$0")
  files=$(ocrd workspace find -G "$filegrp")
  while IFS= read -r file; do
    [[ -n "$file" ]] || continue
    xmllint --noout --schema "$script_dir/xsd/pagecontent.2018-07-15.xsd" "$file"
  done <<<"$files"
}

#######################################
# Rewrite imageFilename attributes that start with OCR-D-IMG so they start
# with ../OCR-D-IMG instead, in every file of a file group (in place).
# Arguments:
#   $1 - file group to process
#######################################
page_fix_image_references() {
  # Make image references relative to the PAGE XML file. The rest of OCR-D probably isn't going to like it, but it
  # is a. correct and b. makes PAGE Viewer open the image file automatically.
  local filegrp=$1
  local files file

  files=$(ocrd workspace find -G "$filegrp")
  while IFS= read -r file; do
    [[ -n "$file" ]] || continue
    sed -i 's#imageFilename="OCR-D-IMG#imageFilename="../OCR-D-IMG#g' -- "$file"
  done <<<"$files"
}

#######################################
# Run the complete workflow in order.
#######################################
main() {
  do_fontident
  do_linesegmentation
  do_ocr

  page_fix_xml OCR-D-OCR-TESS
  page_validate_xml OCR-D-OCR-TESS # This also makes sure PAGE Viewer can open it
  page_fix_image_references OCR-D-OCR-TESS
}

main "$@"

# XXX Multiple calls create multiple identical mets:agent elements
# XXX Global -l DEBUG

# vim:tw=120: