Do word segmentation as expected by OCR-D PAGE specs

fix/readme-no-checkpoint
Gerber, Mike 5 years ago
parent 0f9c94e7dc
commit 6f4736f8e4

@ -14,12 +14,12 @@ This processor only operates on the text line level and so needs a line segmenta
image) as its input. image) as its input.
In addition to the line text it also outputs glyph segmentation including In addition to the line text it also outputs glyph segmentation including
per-glyph confidence values and per-glyph alternative predictions as provided per-glyph confidence values and per-glyph alternative predictions as provided by
by the Calamari OCR engine. Note that while Calamari does not provide word the Calamari OCR engine. Note that while Calamari does not provide word
segmentation, this processor produces word segmentation inferred from Unicode segmentation, this processor produces word segmentation inferred from text
text segmentation and the glyph positions. The provided glyph and word segmentation and the glyph positions. The provided glyph and word segmentation
segmentation can be used for text extraction and highlighting, but is probably can be used for text extraction and highlighting, but is probably not useful for
not useful for further image-based processing. further image-based processing.
## Installation ## Installation

@ -4,7 +4,6 @@ import os
from glob import glob from glob import glob
import numpy as np import numpy as np
import uniseg.wordbreak
from calamari_ocr.ocr import MultiPredictor from calamari_ocr.ocr import MultiPredictor
from calamari_ocr.ocr.voting import voter_from_proto from calamari_ocr.ocr.voting import voter_from_proto
from calamari_ocr.proto import VoterParams from calamari_ocr.proto import VoterParams
@ -101,26 +100,32 @@ class CalamariRecognize(Processor):
# Save word results # Save word results
# #
# Calamari OCR does not provide word positions, so we infer word positions from a. Unicode text # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
# segmentation and b. the glyph positions. This is necessary because the PAGE XML format enforces # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
# a strict hierarchy of lines > words > glyphs. # hierarchy of lines > words > glyphs.
def unwanted(c): def _words(s):
""" """Split words based on spaces and include spaces as 'words'"""
Define unwanted characters spaces = None
word = ''
Words only containing these e.g. whitespace characters are not considered as words. for c in s:
""" if c == ' ' and spaces is True:
return c == " " word += c
elif c != ' ' and spaces is False:
word += c
else:
if word:
yield word
word = c
spaces = (c == ' ')
yield word
word_no = 0 word_no = 0
i = 0 i = 0
for word_text in uniseg.wordbreak.words(prediction.sentence):
# XXX Re-use word segmentation from dinglehopper, i.e. support private use characters
word_length = len(word_text)
do_not_include = all(unwanted(c) for c in word_text)
if not do_not_include: for word_text in _words(prediction.sentence):
word_length = len(word_text)
if not all(c == ' ' for c in word_text):
word_positions = prediction.positions[i:i+word_length] word_positions = prediction.positions[i:i+word_length]
word_start = word_positions[0].global_start word_start = word_positions[0].global_start
word_end = word_positions[-1].global_end word_end = word_positions[-1].global_end
@ -152,10 +157,9 @@ class CalamariRecognize(Processor):
word.add_Glyph(glyph) word.add_Glyph(glyph)
line.add_Word(word) line.add_Word(word)
word_no += 1
i += word_length i += word_length
word_no += 1
_page_update_higher_textequiv_levels('line', pcgts) _page_update_higher_textequiv_levels('line', pcgts)

@ -4,4 +4,3 @@ calamari-ocr == 0.3.5
setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
click click
ocrd >= 2.2.1 ocrd >= 2.2.1
uniseg

Loading…
Cancel
Save