From 3bde7cb37ff54458f4f13e55ccf08f5e8b43cbda Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Sat, 26 Jun 2021 13:50:00 +0200
Subject: [PATCH 1/2] init from constructor not process(), use conventional
 name setup()

---
 ocrd_calamari/recognize.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py
index 43b8930..9cb2238 100644
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -39,9 +39,14 @@ class CalamariRecognize(Processor):
         kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
         kwargs['version'] = '%s (calamari %s, tensorflow %s)' % (OCRD_TOOL['version'], calamari_version, tensorflow_version)
         super(CalamariRecognize, self).__init__(*args, **kwargs)
+        if hasattr(self, 'output_file_grp'):
+            # processing context
+            self.setup()
 
-    def _init_calamari(self):
-
+    def setup(self):
+        """
+        Set up the model prior to processing.
+        """
         if not self.parameter.get('checkpoint', None) and self.parameter.get('checkpoint_dir', None):
             resolved = self.resolve_resource(self.parameter['checkpoint_dir'])
             self.parameter['checkpoint'] = '%s/*.ckpt.json' % resolved
@@ -69,8 +74,6 @@ class CalamariRecognize(Processor):
         assert_file_grp_cardinality(self.input_file_grp, 1)
         assert_file_grp_cardinality(self.output_file_grp, 1)
 
-        self._init_calamari()
-
         for (n, input_file) in enumerate(self.input_files):
             page_id = input_file.pageId or input_file.ID
             log.info("INPUT FILE %i / %s", n, page_id)

From 4c6d6655e12bc7f63b8a446eace26be2a459d357 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Sat, 26 Jun 2021 13:50:20 +0200
Subject: [PATCH 2/2] improve process() docstring

---
 ocrd_calamari/recognize.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py
index 9cb2238..3cdd26f 100644
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -67,7 +67,31 @@ class CalamariRecognize(Processor):
 
     def process(self):
         """
-        Performs the recognition.
+        Perform text recognition with Calamari on the workspace.
+
+        For each page of the input file group, open and deserialize input PAGE-XML
+        and its respective images. Then iterate over the element hierarchy down to
+        the line level.
+
+        For each textline, retrieve a segment image according to the layout annotation
+        (from an existing ``AlternativeImage``, or by cropping into the higher-level
+        images, and deskewing when applicable).
+
+        If the line element contains any previous text results or word segmentation,
+        delete them.
+
+        Convert the line image to a Numpy array and pass it to the recognizer. Aggregate
+        character results on the line level, stripping leading and trailing white space,
+        and selecting the best hypothesis for each position. Annotate the resulting
+        TextEquiv string and (average) confidence on the line segment.
+
+        If ``textequiv_level`` is ``word`` or ``glyph``, then additionally create word
+        level segments by splitting at white space characters, using the vertical
+        line coordinates and horizontal white space boundaries. In the case of ``glyph``,
+        create glyph level segments as well, adding all alternative character hypotheses
+        down to ``glyph_conf_cutoff`` confidence threshold.
+
+        Produce a new PAGE output file by serializing the resulting hierarchy.
         """
         log = getLogger('processor.CalamariRecognize')