From e3a1afbc938ef9068bcd45031bac56e8c7dbdfc4 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 3 Jul 2019 12:22:55 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20Document=20the=20functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- my_ocrd_workflow | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/my_ocrd_workflow b/my_ocrd_workflow index 8dad226..c264fb5 100755 --- a/my_ocrd_workflow +++ b/my_ocrd_workflow @@ -9,6 +9,8 @@ fi remove_filegrp() { + # Remove the given file group from the workspace + filegrp_use=$1 mets=$2 @@ -21,18 +23,23 @@ remove_filegrp() { } do_binarization() { + # Binarize the images + remove_filegrp OCR-D-IMG-BIN mets.xml ocrd-kraken-binarize -l $LOG_LEVEL \ -m mets.xml -I OCR-D-IMG -O OCR-D-IMG-BIN } do_fontident() { + # Identify fonts in the images + network=`python3 -c "import ocrd_typegroups_classifier, os; print(os.path.join(os.path.dirname(ocrd_typegroups_classifier.__file__), 'models', 'classifier.tgc'))"` ocrd_typegroups_classifier_parameters=" { \"network\": \"$network\", \"stride\": 143 }" + remove_filegrp OCR-D-FONTIDENT mets.xml ocrd-typegroups-classifier -l $LOG_LEVEL \ -m mets.xml -I OCR-D-IMG -O OCR-D-FONTIDENT \ @@ -47,6 +54,8 @@ do_fontident() { } do_linesegmentation() { + # Segment the lines in the binarized images + remove_filegrp OCR-D-SEG-REGION mets.xml remove_filegrp OCR-D-SEG-LINE mets.xml #ocrd-ocropy-segment -l $LOG_LEVEL \ @@ -68,6 +77,8 @@ do_linesegmentation() { } do_ocr() { + # Perform OCR on the segmented lines + ocrd_tesserocr_recognize_parameters='{ "model": "eng" }' # TODO mods:language + fontident → model remove_filegrp OCR-D-OCR-TESS mets.xml ocrd-tesserocr-recognize -l $LOG_LEVEL \ @@ -80,7 +91,11 @@ do_ocr() { } page_fix_xml() { - # XXX core does not produce valid XML https://github.com/OCR-D/core/issues/242 + # Fix the PAGE XML generated by OCR-D core + # + # XXX core does not produce valid XML (See https://github.com/OCR-D/core/issues/242), fix it by setting the correct + # PAGE XML version. This makes PAGE Viewer open the file. + filegrp=$1 local file @@ -90,6 +105,8 @@ page_fix_xml() { } page_validate_xml() { + # Validate all PAGE XML against the XML schema + filegrp=$1 local file @@ -99,8 +116,11 @@ page_validate_xml() { } page_fix_image_references() { - # Make image references relative to the PAGE XML file. The rest of OCR-D probably isn't going to like it, but it - # is a. correct and b. makes PAGE Viewer open the image file automatically. + # Make image references relative to the PAGE XML file + # + # The rest of OCR-D probably isn't going to like it, but it is a. correct and b. makes PAGE Viewer open the image file + # automatically. + filegrp=$1 local file @@ -116,7 +136,7 @@ do_binarization do_linesegmentation do_ocr page_fix_xml OCR-D-OCR-TESS -page_validate_xml OCR-D-OCR-TESS # This also makes sure PAGE Viewer can open it +page_validate_xml OCR-D-OCR-TESS page_fix_image_references OCR-D-OCR-TESS