mirror of
https://github.com/qurator-spk/ocrd_repair_inconsistencies.git
synced 2025-06-09 03:40:06 +02:00
relax concatenation test: text must be equal irrespective of tokenization/joiner
This commit is contained in:
parent
6ee105b17c
commit
f829015bb5
1 changed file with 5 additions and 3 deletions
|
@@ -126,11 +126,12 @@ def _fix_words(line, page_id, reverse=False):
|
|||
key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x)
|
||||
sorted_words_text = get_text(sorted_words, ' ')
|
||||
|
||||
if sorted_words_text == line_text:
|
||||
if (sorted_words_text == line_text or
|
||||
sorted_words_text.replace(' ', '') == line_text.replace(' ', '')):
|
||||
LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id)
|
||||
line.set_Word(sorted_words)
|
||||
else:
|
||||
LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
|
||||
LOG.debug('Resorting lines of page "%s" line "%s" from %s to %s does not suffice to turn "%s" into "%s"',
|
||||
page_id, line.id,
|
||||
str([word.id for word in words]),
|
||||
str([word.id for word in sorted_words]),
|
||||
|
@@ -174,7 +175,8 @@ def _fix_lines(region, page_id, reverse=False):
|
|||
key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y)
|
||||
sorted_lines_text = get_text(sorted_lines, '\n')
|
||||
|
||||
if sorted_lines_text == region_text:
|
||||
if (sorted_lines_text == region_text or
|
||||
sorted_lines_text.replace('\n', '') == region_text.replace('\n', '')):
|
||||
LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id)
|
||||
region.set_TextLine(sorted_lines)
|
||||
else:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue