Do word segmentation as expected by OCR-D PAGE specs

fix/readme-no-checkpoint
Gerber, Mike 5 years ago
parent 0f9c94e7dc
commit 6f4736f8e4

@ -14,12 +14,12 @@ This processor only operates on the text line level and so needs a line segmenta
image) as its input.
In addition to the line text it also outputs glyph segmentation including
per-glyph confidence values and per-glyph alternative predictions as provided
by the Calamari OCR engine. Note that while Calamari does not provide word
segmentation, this processor produces word segmentation inferred from Unicode
text segmentation and the glyph positions. The provided glyph and word
segmentation can be used for text extraction and highlighting, but is probably
not useful for further image-based processing.
per-glyph confidence values and per-glyph alternative predictions as provided by
the Calamari OCR engine. Note that while Calamari does not provide word
segmentation, this processor produces word segmentation inferred from text
segmentation and the glyph positions. The provided glyph and word segmentation
can be used for text extraction and highlighting, but is probably not useful for
further image-based processing.
## Installation

@ -4,7 +4,6 @@ import os
from glob import glob
import numpy as np
import uniseg.wordbreak
from calamari_ocr.ocr import MultiPredictor
from calamari_ocr.ocr.voting import voter_from_proto
from calamari_ocr.proto import VoterParams
@ -101,26 +100,32 @@ class CalamariRecognize(Processor):
# Save word results
#
# Calamari OCR does not provide word positions, so we infer word positions from a. Unicode text
# segmentation and b. the glyph positions. This is necessary because the PAGE XML format enforces
# a strict hierarchy of lines > words > glyphs.
def unwanted(c):
    """
    Tell whether a character is unwanted.

    A word made up solely of unwanted characters (currently just the space
    character) is not treated as a word.
    """
    is_unwanted = (" " == c)
    return is_unwanted
# Calamari OCR does not provide word positions, so we infer them from (a) splitting the text on
# spaces and (b) the glyph positions. This is necessary because the PAGE XML format enforces a
# strict hierarchy of lines > words > glyphs.
def _words(s):
"""Split words based on spaces and include spaces as 'words'"""
spaces = None
word = ''
for c in s:
if c == ' ' and spaces is True:
word += c
elif c != ' ' and spaces is False:
word += c
else:
if word:
yield word
word = c
spaces = (c == ' ')
yield word
word_no = 0
i = 0
for word_text in uniseg.wordbreak.words(prediction.sentence):
# XXX Re-use word segmentation from dinglehopper, i.e. support private use characters
word_length = len(word_text)
do_not_include = all(unwanted(c) for c in word_text)
if not do_not_include:
for word_text in _words(prediction.sentence):
word_length = len(word_text)
if not all(c == ' ' for c in word_text):
word_positions = prediction.positions[i:i+word_length]
word_start = word_positions[0].global_start
word_end = word_positions[-1].global_end
@ -152,10 +157,9 @@ class CalamariRecognize(Processor):
word.add_Glyph(glyph)
line.add_Word(word)
word_no += 1
i += word_length
word_no += 1
_page_update_higher_textequiv_levels('line', pcgts)

@ -4,4 +4,3 @@ calamari-ocr == 0.3.5
setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
click
ocrd >= 2.2.1
uniseg

Loading…
Cancel
Save