mirror of
https://github.com/mikegerber/ocrd_calamari.git
synced 2025-06-11 04:39:53 +02:00
🐛 Build line text on our own
Calamari applies whitespace post-processing to prediction.sentence, but not to prediction.positions. Apply the same post-processing to the positions ourselves and build the line text from them, so that the text and the character positions stay consistent. Fixes GH-37.
This commit is contained in:
parent
30f7e1b246
commit
5b6d8b3f41
1 changed file with 40 additions and 4 deletions
|
@ -1,6 +1,7 @@
|
||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import itertools
|
||||||
from glob import glob
|
from glob import glob
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -84,8 +85,39 @@ class CalamariRecognize(Processor):
|
||||||
prediction = self.voter.vote_prediction_result(raw_results)
|
prediction = self.voter.vote_prediction_result(raw_results)
|
||||||
prediction.id = "voted"
|
prediction.id = "voted"
|
||||||
|
|
||||||
line_text = prediction.sentence
|
# Build line text on our own
|
||||||
line_conf = prediction.avg_char_probability
|
#
|
||||||
|
# Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
|
||||||
|
# on prediction.positions. Do it on our own to have consistency.
|
||||||
|
#
|
||||||
|
# XXX Check Calamari's built-in post-processing on prediction.sentence
|
||||||
|
|
||||||
|
def _drop_leading_spaces(positions):
|
||||||
|
return list(itertools.dropwhile(lambda p: p.chars[0].char == " ", positions))
|
||||||
|
def _drop_trailing_spaces(positions):
|
||||||
|
return list(reversed(_drop_leading_spaces(reversed(positions))))
|
||||||
|
def _drop_double_spaces(positions):
|
||||||
|
def _drop_double_spaces_generator(positions):
|
||||||
|
last_was_space = False
|
||||||
|
for p in positions:
|
||||||
|
if p.chars[0].char == " ":
|
||||||
|
if not last_was_space:
|
||||||
|
yield p
|
||||||
|
last_was_space = True
|
||||||
|
else:
|
||||||
|
yield p
|
||||||
|
last_was_space = False
|
||||||
|
return list(_drop_double_spaces_generator(positions))
|
||||||
|
positions = prediction.positions
|
||||||
|
positions = _drop_leading_spaces(positions)
|
||||||
|
positions = _drop_trailing_spaces(positions)
|
||||||
|
positions = _drop_double_spaces(positions)
|
||||||
|
positions = list(positions)
|
||||||
|
|
||||||
|
line_text = ''.join(p.chars[0].char for p in positions)
|
||||||
|
if line_text != prediction.sentence:
|
||||||
|
log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'",
|
||||||
|
line_text, prediction.sentence)
|
||||||
|
|
||||||
# Delete existing results
|
# Delete existing results
|
||||||
if line.get_TextEquiv():
|
if line.get_TextEquiv():
|
||||||
|
@ -96,8 +128,10 @@ class CalamariRecognize(Processor):
|
||||||
line.set_Word([])
|
line.set_Word([])
|
||||||
|
|
||||||
# Save line results
|
# Save line results
|
||||||
|
line_conf = prediction.avg_char_probability
|
||||||
line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
|
line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])
|
||||||
|
|
||||||
|
|
||||||
# Save word results
|
# Save word results
|
||||||
#
|
#
|
||||||
# Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
|
# Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
|
||||||
|
@ -124,10 +158,12 @@ class CalamariRecognize(Processor):
|
||||||
word_no = 0
|
word_no = 0
|
||||||
i = 0
|
i = 0
|
||||||
|
|
||||||
for word_text in _words(prediction.sentence):
|
|
||||||
|
|
||||||
|
for word_text in _words(line_text):
|
||||||
word_length = len(word_text)
|
word_length = len(word_text)
|
||||||
if not all(c == ' ' for c in word_text):
|
if not all(c == ' ' for c in word_text):
|
||||||
word_positions = prediction.positions[i:i+word_length]
|
word_positions = positions[i:i+word_length]
|
||||||
word_start = word_positions[0].global_start
|
word_start = word_positions[0].global_start
|
||||||
word_end = word_positions[-1].global_end
|
word_end = word_positions[-1].global_end
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue