1
0
Fork 0
mirror of https://github.com/mikegerber/ocrd_calamari.git synced 2025-06-10 04:09:53 +02:00

Do word segmentation as expected by OCR-D PAGE specs

This commit is contained in:
Gerber, Mike 2020-02-03 19:10:16 +01:00
parent 0f9c94e7dc
commit 6f4736f8e4
3 changed files with 28 additions and 25 deletions

View file

@ -4,7 +4,6 @@ import os
from glob import glob
import numpy as np
import uniseg.wordbreak
from calamari_ocr.ocr import MultiPredictor
from calamari_ocr.ocr.voting import voter_from_proto
from calamari_ocr.proto import VoterParams
@ -101,26 +100,32 @@ class CalamariRecognize(Processor):
# Save word results
#
# Calamari OCR does not provide word positions, so we infer word positions from a. Unicode text
# segmentation and b. the glyph positions. This is necessary because the PAGE XML format enforces
# a strict hierarchy of lines > words > glyphs.
# Calamari OCR does not provide word positions, so we infer them from (a) text segmentation
# and (b) the glyph positions. This is necessary because the PAGE XML format enforces a strict
# hierarchy of lines > words > glyphs.
def unwanted(c):
    """
    Tell whether a character is unwanted.

    A word made up solely of unwanted characters is not treated as a word.
    Only the plain space character is currently classified as unwanted.
    """
    is_space = c == " "
    return is_space
def _words(s):
"""Split words based on spaces and include spaces as 'words'"""
spaces = None
word = ''
for c in s:
if c == ' ' and spaces is True:
word += c
elif c != ' ' and spaces is False:
word += c
else:
if word:
yield word
word = c
spaces = (c == ' ')
yield word
word_no = 0
i = 0
for word_text in uniseg.wordbreak.words(prediction.sentence):
# XXX Re-use word segmentation from dinglehopper, i.e. support private use characters
word_length = len(word_text)
do_not_include = all(unwanted(c) for c in word_text)
if not do_not_include:
for word_text in _words(prediction.sentence):
word_length = len(word_text)
if not all(c == ' ' for c in word_text):
word_positions = prediction.positions[i:i+word_length]
word_start = word_positions[0].global_start
word_end = word_positions[-1].global_end
@ -152,10 +157,9 @@ class CalamariRecognize(Processor):
word.add_Glyph(glyph)
line.add_Word(word)
word_no += 1
i += word_length
word_no += 1
_page_update_higher_textequiv_levels('line', pcgts)