🐛 Build line text on our own

Calamari does whitespace post-processing on prediction.sentence, while it does not do the same on prediction.positions. Do it on our own to have consistency. Fixes GH-37.
2025-08-10 17:49:51 +02:00 · 2020-02-12 12:25:25 +01:00 · 2020-02-12 12:25:25 +01:00 · 5b6d8b3f41
commit 5b6d8b3f41
parent 30f7e1b246
1 changed files with 40 additions and 4 deletions
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@ -1,6 +1,7 @@
 from __future__ import absolute_import

 import os
+import itertools
 from glob import glob

 import numpy as np
@ -84,8 +85,39 @@ class CalamariRecognize(Processor):
                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

-                    line_text = prediction.sentence
-                    line_conf = prediction.avg_char_probability
+                    # Build line text on our own
+                    #
+                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
+                    # on prediction.positions. Do it on our own to have consistency.
+                    #
+                    # XXX Check Calamari's built-in post-processing on prediction.sentence
+
+                    def _drop_leading_spaces(positions):
+                        return list(itertools.dropwhile(lambda p: p.chars[0].char == " ", positions))
+                    def _drop_trailing_spaces(positions):
+                        return list(reversed(_drop_leading_spaces(reversed(positions))))
+                    def _drop_double_spaces(positions):
+                        def _drop_double_spaces_generator(positions):
+                            last_was_space = False
+                            for p in positions:
+                                if p.chars[0].char == " ":
+                                    if not last_was_space:
+                                        yield p
+                                    last_was_space = True
+                                else:
+                                    yield p
+                                    last_was_space = False
+                        return list(_drop_double_spaces_generator(positions))
+                    positions = prediction.positions
+                    positions = _drop_leading_spaces(positions)
+                    positions = _drop_trailing_spaces(positions)
+                    positions = _drop_double_spaces(positions)
+                    positions = list(positions)
+
+                    line_text = ''.join(p.chars[0].char for p in positions)
+                    if line_text != prediction.sentence:
+                        log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'",
+                                    line_text, prediction.sentence)

                    # Delete existing results
                    if line.get_TextEquiv():
@ -96,8 +128,10 @@ class CalamariRecognize(Processor):
                    line.set_Word([])

                    # Save line results
+                    line_conf = prediction.avg_char_probability
                    line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])

+
                    # Save word results
                    #
                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
@ -124,10 +158,12 @@ class CalamariRecognize(Processor):
                        word_no = 0
                        i = 0

-                        for word_text in _words(prediction.sentence):
+
+
+                        for word_text in _words(line_text):
                            word_length = len(word_text)
                            if not all(c == ' ' for c in word_text):
-                                word_positions = prediction.positions[i:i+word_length]
+                                word_positions = positions[i:i+word_length]
                                word_start = word_positions[0].global_start
                                word_end = word_positions[-1].global_end