From e3a1afbc938ef9068bcd45031bac56e8c7dbdfc4 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 3 Jul 2019 12:22:55 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20Document=20the=20functions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 my_ocrd_workflow | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/my_ocrd_workflow b/my_ocrd_workflow
index 8dad226..c264fb5 100755
--- a/my_ocrd_workflow
+++ b/my_ocrd_workflow
@@ -9,6 +9,8 @@ fi
 
 
 remove_filegrp() {
+  # Remove the given file group from the workspace
+
   filegrp_use=$1
   mets=$2
 
@@ -21,18 +23,23 @@ remove_filegrp() {
 }
 
 do_binarization() {
+  # Binarize the images
+
   remove_filegrp OCR-D-IMG-BIN mets.xml
   ocrd-kraken-binarize -l $LOG_LEVEL \
     -m mets.xml -I OCR-D-IMG -O OCR-D-IMG-BIN
 }
 
 do_fontident() {
+  # Identify fonts in the images
+
   network=`python3 -c "import ocrd_typegroups_classifier, os; print(os.path.join(os.path.dirname(ocrd_typegroups_classifier.__file__), 'models', 'classifier.tgc'))"`
   ocrd_typegroups_classifier_parameters="
   {
     \"network\": \"$network\",
     \"stride\":  143
   }"
+
   remove_filegrp OCR-D-FONTIDENT mets.xml
   ocrd-typegroups-classifier -l $LOG_LEVEL \
     -m mets.xml -I OCR-D-IMG -O OCR-D-FONTIDENT \
@@ -47,6 +54,8 @@ do_fontident() {
 }
 
 do_linesegmentation() {
+  # Segment the lines in the binarized images
+
   remove_filegrp OCR-D-SEG-REGION mets.xml
   remove_filegrp OCR-D-SEG-LINE mets.xml
   #ocrd-ocropy-segment -l $LOG_LEVEL \
@@ -68,6 +77,8 @@ do_linesegmentation() {
 }
 
 do_ocr() {
+  # Perform OCR on the segmented lines
+
   ocrd_tesserocr_recognize_parameters='{ "model": "eng" }'  # TODO mods:language + fontident → model
   remove_filegrp OCR-D-OCR-TESS mets.xml
   ocrd-tesserocr-recognize -l $LOG_LEVEL \
@@ -80,7 +91,11 @@ do_ocr() {
 }
 
 page_fix_xml() {
-  # XXX core does not produce valid XML https://github.com/OCR-D/core/issues/242
+  # Fix the PAGE XML generated by OCR-D core
+  #
+  # XXX core does not produce valid XML (see https://github.com/OCR-D/core/issues/242); fix it by setting the correct
+  # PAGE XML version. This makes PAGE Viewer open the file.
+
   filegrp=$1
 
   local file
@@ -90,6 +105,8 @@ page_fix_xml() {
 }
 
 page_validate_xml() {
+  # Validate all PAGE XML against the XML schema
+
   filegrp=$1
 
   local file
@@ -99,8 +116,11 @@ page_validate_xml() {
 }
 
 page_fix_image_references() {
-  # Make image references relative to the PAGE XML file. The rest of OCR-D probably isn't going to like it, but it
-  # is a. correct and b. makes PAGE Viewer open the image file automatically.
+  # Make image references relative to the PAGE XML file
+  #
+  # The rest of OCR-D probably isn't going to like it, but it is (a) correct and (b) makes PAGE Viewer open the image
+  # file automatically.
+
   filegrp=$1
 
   local file
@@ -116,7 +136,7 @@ do_binarization
 do_linesegmentation
 do_ocr
 page_fix_xml              OCR-D-OCR-TESS
-page_validate_xml         OCR-D-OCR-TESS  # This also makes sure PAGE Viewer can open it
+page_validate_xml         OCR-D-OCR-TESS
 page_fix_image_references OCR-D-OCR-TESS