#!/bin/bash
#
# Runs three OCR-D processors, in order, against the mets.xml in the current
# working directory:
#   1. font identification  (ocrd-typegroups-classifier)
#   2. line segmentation    (ocrd-ocropy-segment)
#   3. text recognition     (ocrd-tesserocr-recognize)
# Each step first deletes its own output fileGrp from mets.xml (presumably so
# the step can be re-run on the same workspace).
#
# Environment:
#   OCRD_TYPEGROUPS_MODEL  optional; path to the typegroups classifier model.
#                          Defaults to the path originally hard-coded here.

set -e  # Abort on error
set -x  # Trace every command

#######################################
# Delete every mets:fileGrp with the given USE attribute, editing in place.
# Arguments:
#   $1 - value of the USE attribute to match
#   $2 - path to the METS file
#######################################
remove_filegrp() {
  local filegrp_use=$1
  local mets=$2

  xmlstarlet ed --inplace \
    -N mets=http://www.loc.gov/METS/ \
    -d "//mets:fileGrp[@USE='${filegrp_use}']" "$mets"
}

#######################################
# Font identification: OCR-D-IMG -> OCR-D-FONTIDENT.
# Globals:
#   OCRD_TYPEGROUPS_MODEL (read, optional)
#######################################
do_fontident() {
  local model=${OCRD_TYPEGROUPS_MODEL:-/home/mike/devel/OCR-D/monorepo/ocrd_typegroups_classifier/ocrd_typegroups_classifier/models/classifier.tgc}
  local parameters
  # NOTE: the path is interpolated verbatim into JSON, so it must not contain
  # double quotes or backslashes.
  printf -v parameters '{ "network": "%s", "stride": 143 }' "$model"

  remove_filegrp OCR-D-FONTIDENT mets.xml
  ocrd-typegroups-classifier -l DEBUG \
    -m mets.xml -I OCR-D-IMG -O OCR-D-FONTIDENT \
    -p <(printf '%s\n' "$parameters")
  # XXX does DEFAULT have any meaning? /buerger_gedichte_1778.ocrd does not have
  #     any DEFAULT, yet -I DEFAULT seems to work for ocrd-typegroups-classifier

  #ocrd workspace validate mets.xml
  # XXX Unspecified USE category 'FONTIDENT' in fileGrp 'OCR-D-FONTIDENT'
  # XXX File 'OCR-D-FONTIDENT_OCR-D-IMG_0002' does not manifest any physical page.
  # XXX Won't download remote image
}

#######################################
# Line segmentation: OCR-D-IMG -> OCR-D-SEG-LINE.
#######################################
do_linesegmentation() {
  remove_filegrp OCR-D-SEG-LINE mets.xml
  ocrd-ocropy-segment -l DEBUG \
    -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-LINE

  #ocrd workspace validate mets.xml

  # XXX This leaves copies of the images at the top level of the workspace,
  #     because it "downloads" the "remote" files.
  # Clean it up.
  find . -maxdepth 1 -name "OCR-D-IMG*" -type f -exec rm -v {} \;

  # XXX ocrd-tesserocr-segment-line does not seem to produce any line segmentation
  # XXX mv {ocrd-ocropy-segment,-line}
}

#######################################
# Text recognition: OCR-D-SEG-LINE -> OCR-D-OCR-TESS.
#######################################
do_ocr() {
  # TODO mods:language + fontident → model
  local parameters='{ "model": "eng" }'

  remove_filegrp OCR-D-OCR-TESS mets.xml
  ocrd-tesserocr-recognize -l DEBUG \
    -m mets.xml -I OCR-D-SEG-LINE -O OCR-D-OCR-TESS \
    -p <(printf '%s\n' "$parameters")

  #ocrd workspace validate mets.xml
}

do_fontident
do_linesegmentation
do_ocr

# XXX Multiple calls create multiple identical mets:agent elements
# XXX Global -l DEBUG

# vim:tw=120: