1
0
Fork 0
mirror of https://github.com/mikegerber/ocrd_calamari.git synced 2025-06-26 03:59:53 +02:00

Do word segmentation as expected by OCR-D PAGE specs

This commit is contained in:
Gerber, Mike 2020-02-03 19:10:16 +01:00
parent 0f9c94e7dc
commit 6f4736f8e4
3 changed files with 28 additions and 25 deletions

View file

@ -14,12 +14,12 @@ This processor only operates on the text line level and so needs a line segmenta
image) as its input.
In addition to the line text it also outputs glyph segmentation including
per-glyph confidence values and per-glyph alternative predictions as provided
by the Calamari OCR engine. Note that while Calamari does not provide word
segmentation, this processor produces word segmentation inferred from Unicode
text segmentation and the glyph positions. The provided glyph and word
segmentation can be used for text extraction and highlighting, but is probably
not useful for further image-based processing.
per-glyph confidence values and per-glyph alternative predictions as provided by
the Calamari OCR engine. Note that while Calamari does not provide word
segmentation, this processor produces word segmentation inferred from text
segmentation and the glyph positions. The provided glyph and word segmentation
can be used for text extraction and highlighting, but is probably not useful for
further image-based processing.
## Installation

View file

@ -4,7 +4,6 @@ import os
from glob import glob
import numpy as np
import uniseg.wordbreak
from calamari_ocr.ocr import MultiPredictor
from calamari_ocr.ocr.voting import voter_from_proto
from calamari_ocr.proto import VoterParams
@ -101,26 +100,32 @@ class CalamariRecognize(Processor):
# Save word results
#
# Calamari OCR does not provide word positions, so we infer word positions from a. Unicode text
# segmentation and b. the glyph positions. This is necessary because the PAGE XML format enforces
# a strict hierarchy of lines > words > glyphs.
# Calamari OCR does not provide word positions, so we infer word positions from (a) splitting the
# text on spaces and (b) the glyph positions. This is necessary because the PAGE XML format enforces
# a strict hierarchy of lines > words > glyphs.
def unwanted(c):
    """
    Tell whether a character counts as unwanted.

    A word made up solely of unwanted characters (currently just the plain
    space) is not considered a word.
    """
    is_space = (c == " ")
    return is_space
def _words(s):
"""Split words based on spaces and include spaces as 'words'"""
spaces = None
word = ''
for c in s:
if c == ' ' and spaces is True:
word += c
elif c != ' ' and spaces is False:
word += c
else:
if word:
yield word
word = c
spaces = (c == ' ')
yield word
word_no = 0
i = 0
for word_text in uniseg.wordbreak.words(prediction.sentence):
# XXX Re-use word segmentation from dinglehopper, i.e. support private use characters
word_length = len(word_text)
do_not_include = all(unwanted(c) for c in word_text)
if not do_not_include:
for word_text in _words(prediction.sentence):
word_length = len(word_text)
if not all(c == ' ' for c in word_text):
word_positions = prediction.positions[i:i+word_length]
word_start = word_positions[0].global_start
word_end = word_positions[-1].global_end
@ -152,10 +157,9 @@ class CalamariRecognize(Processor):
word.add_Glyph(glyph)
line.add_Word(word)
word_no += 1
i += word_length
word_no += 1
_page_update_higher_textequiv_levels('line', pcgts)

View file

@ -4,4 +4,3 @@ calamari-ocr == 0.3.5
setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
click
ocrd >= 2.2.1
uniseg