Include proper word + glyph segmentation

fix/readme-no-checkpoint
Gerber, Mike 5 years ago
parent 24532f693a
commit 507bc1ce5e

@@ -4,6 +4,7 @@ import os
from glob import glob from glob import glob
import numpy as np import numpy as np
import uniseg.wordbreak
from calamari_ocr.ocr import MultiPredictor from calamari_ocr.ocr import MultiPredictor
from calamari_ocr.ocr.voting import voter_from_proto from calamari_ocr.ocr.voting import voter_from_proto
from calamari_ocr.proto import VoterParams from calamari_ocr.proto import VoterParams
@@ -13,7 +14,7 @@ from ocrd_models.ocrd_page import (
LabelType, LabelsType, LabelType, LabelsType,
MetadataItemType, MetadataItemType,
TextEquivType, TextEquivType,
WordType, CoordsType, WordType, GlyphType, CoordsType,
to_xml to_xml
) )
from ocrd_utils import getLogger, concat_padded, coordinates_for_segment, points_from_polygon, MIMETYPE_PAGE from ocrd_utils import getLogger, concat_padded, coordinates_for_segment, points_from_polygon, MIMETYPE_PAGE
@@ -95,28 +96,53 @@ class CalamariRecognize(Processor):
line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)]) line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
# Save word results # Save word results
# XXX For early development just put every char = glyph into its own word def unwanted(c):
for word_no, p in enumerate(prediction.positions): return c == " "
start = p.global_start
end = p.global_end
word_no = 0
i = 0
for word_text in uniseg.wordbreak.words(prediction.sentence):
print(word_text)
word_length = len(word_text)
do_not_include = all(unwanted(c) for c in word_text)
# XXX Maybe use version in ocrd_tesserocr if not do_not_include:
h = line_image.height word_positions = prediction.positions[i:i+word_length]
polygon = [(start, 0), (end, 0), (end, h), (start, h)] word_start = word_positions[0].global_start
points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords)) word_end = word_positions[-1].global_end
word = WordType( # XXX Maybe use version in ocrd_tesserocr
id='%s_word%04d' % (line.id, word_no), h = line_image.height
Coords=CoordsType(points)) polygon = [(word_start, 0), (word_end, 0), (word_end, h), (word_start, h)]
points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))
chars = sorted(p.chars, key=lambda k: k.probability, reverse=True) word = WordType(id='%s_word%04d' % (line.id, word_no), Coords=CoordsType(points))
for index, char in enumerate(chars): word.add_TextEquiv(TextEquivType(Unicode=word_text))
if char.char:
word.add_TextEquiv(TextEquivType(Unicode=char.char, index=index, conf=char.probability)) for glyph_no, p in enumerate(word_positions):
# XXX Note that omission probabilities are not normalized?! glyph_start = p.global_start
glyph_end = p.global_end
# XXX Maybe use version in ocrd_tesserocr
h = line_image.height
polygon = [(glyph_start, 0), (glyph_end, 0), (glyph_end, h), (glyph_start, h)]
points = points_from_polygon(coordinates_for_segment(polygon, None, line_coords))
glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no), Coords=CoordsType(points))
chars = sorted(p.chars, key=lambda k: k.probability, reverse=True)
for index, char in enumerate(chars):
if char.char:
glyph.add_TextEquiv(TextEquivType(Unicode=char.char, index=index, conf=char.probability))
# XXX Note that omission probabilities are not normalized?!
word.add_Glyph(glyph)
line.add_Word(word)
i += word_length
word_no += 1
line.add_Word(word)
_page_update_higher_textequiv_levels('line', pcgts) _page_update_higher_textequiv_levels('line', pcgts)

@@ -4,3 +4,4 @@ calamari-ocr == 0.3.5
setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime? setuptools >= 41.0.0 # tensorboard depends on this, but why do we get an error at runtime?
click click
ocrd >= 2.2.1 ocrd >= 2.2.1
uniseg

Loading…
Cancel
Save