🐛 Sort predictions in exactly the same way to make sure we are correctly removing spaces

2026-03-14 10:31:55 +01:00 · 2020-02-12 16:38:45 +01:00 · 2020-02-12 16:38:45 +01:00 · 0c9e1f13c7
commit 0c9e1f13c7
parent d2c843aa3f
1 changed files with 12 additions and 11 deletions
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -92,8 +92,16 @@ class CalamariRecognize(Processor):
                    #
                    # XXX Check Calamari's built-in post-processing on prediction.sentence
+                    def _sort_chars(p):
+                        """Filter and sort chars of prediction p"""
+                        chars = p.chars
+                        chars = [c for c in chars if c.char]  # XXX Note that omission probabilities are not normalized?!
+                        chars = [c for c in chars if c.probability >= self.parameter['glyph_conf_cutoff']]
+                        chars = sorted(chars, key=lambda k: k.probability, reverse=True)
+                        return chars
                    def _drop_leading_spaces(positions):
-                        return list(itertools.dropwhile(lambda p: p.chars[0].char == " ", positions))
+                        return list(itertools.dropwhile(lambda p: _sort_chars(p)[0].char == " ", positions))
                    def _drop_trailing_spaces(positions):
                        return list(reversed(_drop_leading_spaces(reversed(positions))))
                    def _drop_double_spaces(positions):
@@ -184,17 +192,10 @@ class CalamariRecognize(Processor):
                                        glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points))
-                                        # Filter predictions
+                                        # Add predictions (= TextEquivs)
-                                        chars = p.chars
+                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
-                                        chars = [c for c in chars if c.char]  # XXX Note that omission probabilities are not normalized?!
+                                        for char_index, char in enumerate(_sort_chars(p), start=char_index_start):
-                                        chars = [c for c in chars if c.probability >= self.parameter['glyph_conf_cutoff']]
-                                        # Sort and add predictions (= TextEquivs)
-                                        chars = sorted(chars, key=lambda k: k.probability, reverse=True)
-                                        char_index = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
-                                        for char in chars:
                                            glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability))
-                                            char_index += 1
                                        word.add_Glyph(glyph)