✨ Allow configuring a cutoff confidence value for glyph alternatives

2025-08-08 16:49:54 +02:00 · 2020-02-05 13:07:56 +01:00 · 2020-02-05 13:07:56 +01:00 · b802b4deaf
commit b802b4deaf
parent e39a2bce01
3 changed files with 18 additions and 5 deletions
--- a/README.md
+++ b/README.md
@@ -59,6 +59,9 @@ With `test-parameters.json`:
 }
 ~~~

+You may want to have a look at the [ocrd-tool.json](ocrd-tool.json) descriptions
+for additional parameters and default values.
+
 ## Development & Testing
 For information regarding development and testing, please see
 [README-DEV.md](README-DEV.md).
--- a/ocrd_calamari/ocrd-tool.json
+++ b/ocrd_calamari/ocrd-tool.json
@@ -31,6 +31,12 @@
          "enum": ["line", "word", "glyph"],
          "default": "line",
          "description": "Deepest PAGE XML hierarchy level to include TextEquiv results for"
+        },
+        "glyph_conf_cutoff": {
+          "type": "number",
+          "format": "float",
+          "default": 0.001,
+          "description": "Only include glyph alternatives with confidences at or above this threshold"
        }
      }
    }
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -148,13 +148,17 @@ class CalamariRecognize(Processor):

                                        glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points))

-                                        chars = sorted(p.chars, key=lambda k: k.probability, reverse=True)
+                                        # Filter predictions
+                                        chars = p.chars
+                                        chars = [c for c in chars if c.char]  # XXX Note that omission probabilities are not normalized?!
+                                        chars = [c for c in chars if c.probability >= self.parameter['glyph_conf_cutoff']]
+
+                                        # Sort and add predictions (= TextEquivs)
+                                        chars = sorted(chars, key=lambda k: k.probability, reverse=True)
                                        char_index = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                        for char in chars:
-                                            if char.char:
-                                                glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability))
-                                                char_index += 1
-                                                # XXX Note that omission probabilities are not normalized?!
+                                            glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=char_index, conf=char.probability))
+                                            char_index += 1

                                        word.add_Glyph(glyph)