✨ Do word segmentation as expected by OCR-D PAGE specs

2025-07-28 03:19:54 +02:00 · 2020-02-03 19:10:16 +01:00 · 2020-02-03 19:10:16 +01:00 · 6f4736f8e4
commit 6f4736f8e4
parent 0f9c94e7dc
3 changed files with 28 additions and 25 deletions
--- a/README.md
+++ b/README.md
@@ -14,12 +14,12 @@ This processor only operates on the text line level and so needs a line segmenta
 image) as its input.
 In addition to the line text it also outputs glyph segmentation including
-per-glyph confidence values and per-glyph alternative predictions as provided
+per-glyph confidence values and per-glyph alternative predictions as provided by
-by the Calamari OCR engine. Note that while Calamari does not provide word
+the Calamari OCR engine. Note that while Calamari does not provide word
-segmentation, this processor produces word segmentation inferred from Unicode
+segmentation, this processor produces word segmentation inferred from text
-text segmentation and the glyph positions. The provided glyph and word
+segmentation and the glyph positions. The provided glyph and word segmentation
-segmentation can be used for text extraction and highlighting, but is probably
+can be used for text extraction and highlighting, but is probably not useful for
-not useful for further image-based processing.
+further image-based processing.
 ## Installation
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -4,7 +4,6 @@ import os
 from glob import glob
 import numpy as np
-import uniseg.wordbreak
 from calamari_ocr.ocr import MultiPredictor
 from calamari_ocr.ocr.voting import voter_from_proto
 from calamari_ocr.proto import VoterParams
@@ -101,26 +100,32 @@ class CalamariRecognize(Processor):
                    # Save word results
                    #
-                    # Calamari OCR does not provide word positions, so we infer word positions from a. Unicode text
+                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
-                    # segmentation and b. the glyph positions. This is necessary because the PAGE XML format enforces
+                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
-                    # a strict hierarchy of lines > words > glyphs.
+                    # hierarchy of lines > words > glyphs.
-                    def unwanted(c):
+                    def _words(s):
-                        """
+                        """Split words based on spaces and include spaces as 'words'"""
-                        Define unwanted characters
+                        spaces = None
-
+                        word = ''
-                        Words only containing these e.g. whitespace characters are not considered as words.
+                        for c in s:
-                        """
+                            if c == ' ' and spaces is True:
-                        return c == " "
+                                word += c
+                            elif c != ' ' and spaces is False:
+                                word += c
+                            else:
+                                if word:
+                                    yield word
+                                word = c
+                                spaces = (c == ' ')
+                        yield word
                    word_no = 0
                    i = 0
-                    for word_text in uniseg.wordbreak.words(prediction.sentence):
-                        # XXX Re-use word segmentation from dinglehopper, i.e. support private use characters
-                        word_length = len(word_text)
-                        do_not_include = all(unwanted(c) for c in word_text)
-                        if not do_not_include:
+                    for word_text in _words(prediction.sentence):
+                        word_length = len(word_text)
+                        if not all(c == ' ' for c in word_text):
                            word_positions = prediction.positions[i:i+word_length]
                            word_start = word_positions[0].global_start
                            word_end = word_positions[-1].global_end
@@ -152,10 +157,9 @@ class CalamariRecognize(Processor):
                                word.add_Glyph(glyph)
                            line.add_Word(word)
-
+                            word_no += 1
                        i += word_length
-                        word_no += 1
            _page_update_higher_textequiv_levels('line', pcgts)
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,3 @@ calamari-ocr == 0.3.5
 setuptools >= 41.0.0  # tensorboard depends on this, but why do we get an error at runtime?
 click
 ocrd >= 2.2.1
-uniseg