From 5e1ece4877ae07b806e3f53cdf6bc2ec2f7f1874 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike"
Date: Mon, 24 Jun 2019 12:13:49 +0200
Subject: [PATCH] Use ocrd-tesserocr-segment-*

---
 my_ocrd_workflow | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/my_ocrd_workflow b/my_ocrd_workflow
index 7958e15..df5f599 100755
--- a/my_ocrd_workflow
+++ b/my_ocrd_workflow
@@ -40,18 +40,24 @@ do_fontident() {
 }
 
 do_linesegmentation() {
+  remove_filegrp OCR-D-SEG-REGION mets.xml
   remove_filegrp OCR-D-SEG-LINE mets.xml
 
-  ocrd-ocropy-segment -l $LOG_LEVEL \
-    -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-LINE
+  #ocrd-ocropy-segment -l $LOG_LEVEL \
+  #  -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-LINE
+  # XXX ocrd-ocropy-segment throws an exception for buerger_gedichte_1778.ocrd
+  #ocrd workspace validate mets.xml
+
+  ocrd-tesserocr-segment-region -l $LOG_LEVEL \
+    -m mets.xml -I OCR-D-IMG -O OCR-D-SEG-REGION
   #ocrd workspace validate mets.xml
+  ocrd-tesserocr-segment-line -l $LOG_LEVEL \
+    -m mets.xml -I OCR-D-SEG-REGION -O OCR-D-SEG-LINE
+  #ocrd workspace validate mets.xml
+  # XXX compare ocrd-tesserocr-segment* vs tesseract native
   # XXX This leaves copies of the images at the top level of the workspace, because it "downloads" the "remote" files.
-  # Clean it up.
+  # Clean it up. (Maybe only affects ocrd-ocropy-segment)
   find . -maxdepth 1 -name "OCR-D-IMG*" -type f -exec rm -v {} \;
-
-  # XXX ocrd-tesserocr-segment-line does not seem to produce any line segmentation
-  # XXX mv {ocrd-ocropy-segment,-line}
-  # XXX ocrd-ocropy-segment throws an exception for buerger_gedichte_1778.ocrd
 }
 
 do_ocr() {