📝 Document the functions

2026-02-06 01:22:18 +01:00 · 2019-07-03 12:22:55 +02:00 · 2019-07-03 12:22:55 +02:00 · e3a1afbc93
commit e3a1afbc93
parent 2204aee104
1 changed files with 24 additions and 4 deletions
--- a/28
+++ b/28
@ -9,6 +9,8 @@ fi


 remove_filegrp() {
+  # Remove the given file group from the workspace
+
  filegrp_use=$1
  mets=$2

@ -21,18 +23,23 @@ remove_filegrp() {
 }

 do_binarization() {
+  # Binarize the images: drop any existing OCR-D-IMG-BIN file group, then run ocrd-kraken-binarize (-I OCR-D-IMG, -O OCR-D-IMG-BIN)
+
   remove_filegrp OCR-D-IMG-BIN mets.xml
   ocrd-kraken-binarize -l $LOG_LEVEL \
     -m mets.xml -I OCR-D-IMG -O OCR-D-IMG-BIN
 }

 do_fontident() {
+  # Identify fonts in the images
+
  network=`python3 -c "import ocrd_typegroups_classifier, os; print(os.path.join(os.path.dirname(ocrd_typegroups_classifier.__file__), 'models', 'classifier.tgc'))"`
  ocrd_typegroups_classifier_parameters="
  {
    \"network\": \"$network\",
    \"stride\":  143
  }"
+
  remove_filegrp OCR-D-FONTIDENT mets.xml
  ocrd-typegroups-classifier -l $LOG_LEVEL \
    -m mets.xml -I OCR-D-IMG -O OCR-D-FONTIDENT \
@ -47,6 +54,8 @@ do_fontident() {
 }

 do_linesegmentation() {
+  # Segment the lines in the binarized images
+
  remove_filegrp OCR-D-SEG-REGION mets.xml
  remove_filegrp OCR-D-SEG-LINE mets.xml
  #ocrd-ocropy-segment -l $LOG_LEVEL \
@ -68,6 +77,8 @@ do_linesegmentation() {
 }

 do_ocr() {
+  # Perform OCR on the segmented lines
+
  ocrd_tesserocr_recognize_parameters='{ "model": "eng" }'  # TODO mods:language + fontident → model
  remove_filegrp OCR-D-OCR-TESS mets.xml
  ocrd-tesserocr-recognize -l $LOG_LEVEL \
@ -80,7 +91,11 @@ do_ocr() {
 }

 page_fix_xml() {
-  # XXX core does not produce valid XML https://github.com/OCR-D/core/issues/242
+  # Fix the PAGE XML generated by OCR-D core
+  #
+  # XXX core does not produce valid XML (see https://github.com/OCR-D/core/issues/242); fix it by setting the correct
+  # PAGE XML version. This allows PAGE Viewer to open the file.
+
  filegrp=$1

  local file
@ -90,6 +105,8 @@ page_fix_xml() {
 }

 page_validate_xml() {
+  # Validate all PAGE XML against the XML schema
+
  filegrp=$1

  local file
@ -99,8 +116,11 @@ page_validate_xml() {
 }

 page_fix_image_references() {
-  # Make image references relative to the PAGE XML file. The rest of OCR-D probably isn't going to like it, but it
-  # is a. correct and b. makes PAGE Viewer open the image file automatically.
+  # Make image references relative to the PAGE XML file
+  #
+  # The rest of OCR-D probably isn't going to like it, but it (a) is correct and (b) makes PAGE Viewer open the image
+  # file automatically.
+
  filegrp=$1

  local file
@ -116,7 +136,7 @@ do_binarization
 do_linesegmentation
 do_ocr
 page_fix_xml              OCR-D-OCR-TESS
-page_validate_xml         OCR-D-OCR-TESS  # This also makes sure PAGE Viewer can open it
+page_validate_xml         OCR-D-OCR-TESS
 page_fix_image_references OCR-D-OCR-TESS