mirror of
https://github.com/qurator-spk/ocrd_repair_inconsistencies.git
synced 2025-06-09 03:40:06 +02:00
generalize to other textLineOrder/readingDirection:
- don't ignore regions / lines / words that are not top-to-bottom and left-to-right; instead, only ignore regions that are neither top-to-bottom nor bottom-to-top, and lines or words that are neither left-to-right nor right-to-left (thus applying each criterion on its appropriate level and allowing reverse sorting, while still discounting rotated layouts)
- don't enter segments if they have no more than 1 child
- improve logging: show failed attempts on debug, show pageIds throughout
This commit is contained in:
parent
295165692b
commit
0dc5bdac2e
1 changed files with 73 additions and 17 deletions
|
@ -37,22 +37,51 @@ class RepairInconsistencies(Processor):
|
||||||
regions = page.get_TextRegion()
|
regions = page.get_TextRegion()
|
||||||
|
|
||||||
for region in regions:
|
for region in regions:
|
||||||
if region.readingDirection != 'left-to-right':
|
textLineOrder = 'top-to-bottom'
|
||||||
LOG.info('Not processing region "%s" (not left-to-right)', region.id)
|
for segment in [region, page]:
|
||||||
continue
|
if segment.textLineOrder is None:
|
||||||
if len(region.get_TextLine()) > 1 and region.textLineOrder != 'top-to-bottom':
|
continue
|
||||||
LOG.info('Not processing region "%s" (not top-to-bottom)', region.id)
|
else:
|
||||||
|
textLineOrder = segment.textLineOrder
|
||||||
|
break
|
||||||
|
if textLineOrder not in ['top-to-bottom', 'bottom-to-top']:
|
||||||
|
LOG.info('Not processing page "%s" region "%s" (textLineOrder=%s)',
|
||||||
|
page_id, region.id, textLineOrder)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
_fix_lines(region)
|
_fix_lines(region, page_id, reverse=textLineOrder=='bottom-to-top')
|
||||||
|
|
||||||
lines = region.get_TextLine()
|
lines = region.get_TextLine()
|
||||||
for line in lines:
|
for line in lines:
|
||||||
_fix_words(line)
|
readingDirection = 'left-to-right'
|
||||||
|
for segment in [line, region, page]:
|
||||||
|
if segment.readingDirection is None:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
readingDirection = segment.readingDirection
|
||||||
|
break
|
||||||
|
if readingDirection not in ['left-to-right', 'right-to-left']:
|
||||||
|
LOG.info('Not processing page "%s" line "%s" (readingDirection=%s)',
|
||||||
|
page_id, line.id, readingDirection)
|
||||||
|
continue
|
||||||
|
|
||||||
|
_fix_words(line, page_id, reverse=readingDirection=='right-to-left')
|
||||||
|
|
||||||
words = line.get_Word()
|
words = line.get_Word()
|
||||||
for word in words:
|
for word in words:
|
||||||
_fix_glyphs(word)
|
readingDirection = 'left-to-right'
|
||||||
|
for segment in [word, line, region, page]:
|
||||||
|
if segment.readingDirection is None:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
readingDirection = segment.readingDirection
|
||||||
|
break
|
||||||
|
if readingDirection not in ['left-to-right', 'right-to-left']:
|
||||||
|
LOG.info('Not processing page "%s" word "%s" (readingDirection=%s)',
|
||||||
|
page_id, word.id, readingDirection)
|
||||||
|
continue
|
||||||
|
|
||||||
|
_fix_glyphs(word, page_id, reverse=readingDirection=='right-to-left')
|
||||||
|
|
||||||
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
|
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
|
||||||
if file_id == input_file.ID:
|
if file_id == input_file.ID:
|
||||||
|
@ -84,46 +113,73 @@ def get_text(thing, joiner=None):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def _fix_words(line):
|
def _fix_words(line, page_id, reverse=False):
|
||||||
"""Fix word order in a line"""
|
"""Fix word order in a line"""
|
||||||
|
|
||||||
words = line.get_Word()
|
words = line.get_Word()
|
||||||
|
if not words:
|
||||||
|
return
|
||||||
line_text = get_text(line)
|
line_text = get_text(line)
|
||||||
words_text = get_text(words, ' ')
|
words_text = get_text(words, ' ')
|
||||||
if line_text != words_text:
|
if line_text != words_text:
|
||||||
sorted_words = sorted(words, key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x)
|
sorted_words = sorted(words, reverse=reverse,
|
||||||
|
key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x)
|
||||||
sorted_words_text = get_text(sorted_words, ' ')
|
sorted_words_text = get_text(sorted_words, ' ')
|
||||||
|
|
||||||
if sorted_words_text == line_text:
|
if sorted_words_text == line_text:
|
||||||
LOG.info('Fixing word order of line "%s"', line.id)
|
LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id)
|
||||||
line.set_Word(sorted_words)
|
line.set_Word(sorted_words)
|
||||||
|
else:
|
||||||
|
LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
|
||||||
|
page_id, line.id,
|
||||||
|
str([word.id for word in words]),
|
||||||
|
str([word.id for word in sorted_words]),
|
||||||
|
words_text, line_text)
|
||||||
|
|
||||||
|
|
||||||
def _fix_glyphs(word):
|
def _fix_glyphs(word, page_id, reverse=False):
|
||||||
"""Fix glyph order in a word"""
|
"""Fix glyph order in a word"""
|
||||||
|
|
||||||
glyphs = word.get_Glyph()
|
glyphs = word.get_Glyph()
|
||||||
|
if not glyphs:
|
||||||
|
return
|
||||||
word_text = get_text(word)
|
word_text = get_text(word)
|
||||||
glyphs_text = get_text(glyphs, '')
|
glyphs_text = get_text(glyphs, '')
|
||||||
if word_text != glyphs_text:
|
if word_text != glyphs_text:
|
||||||
sorted_glyphs = sorted(glyphs, key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x)
|
sorted_glyphs = sorted(glyphs, reverse=reverse,
|
||||||
|
key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x)
|
||||||
sorted_glyphs_text = get_text(sorted_glyphs, '')
|
sorted_glyphs_text = get_text(sorted_glyphs, '')
|
||||||
|
|
||||||
if sorted_glyphs_text == word_text:
|
if sorted_glyphs_text == word_text:
|
||||||
LOG.info('Fixing glyph order of word "%s"', word.id)
|
LOG.info('Fixing glyph order of page "%s" word "%s"', page_id, word.id)
|
||||||
word.set_Glyph(sorted_glyphs)
|
word.set_Glyph(sorted_glyphs)
|
||||||
|
else:
|
||||||
|
LOG.debug('Resorting glyphs of page "%s" word "%s" from %s to %s does not suffice to turn "%s" into "%s"',
|
||||||
|
page_id, word.id,
|
||||||
|
str([glyph.id for glyph in glyphs]),
|
||||||
|
str([glyph.id for glyph in sorted_glyphs]),
|
||||||
|
glyphs_text, word_text)
|
||||||
|
|
||||||
|
|
||||||
def _fix_lines(region):
|
def _fix_lines(region, page_id, reverse=False):
|
||||||
"""Fix line order in a region"""
|
"""Fix line order in a region"""
|
||||||
|
|
||||||
lines = region.get_TextLine()
|
lines = region.get_TextLine()
|
||||||
|
if not lines:
|
||||||
|
return
|
||||||
region_text = get_text(region)
|
region_text = get_text(region)
|
||||||
lines_text = get_text(lines, '\n')
|
lines_text = get_text(lines, '\n')
|
||||||
if region_text != lines_text:
|
if region_text != lines_text:
|
||||||
sorted_lines = sorted(lines, key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y)
|
sorted_lines = sorted(lines, reverse=reverse,
|
||||||
|
key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y)
|
||||||
sorted_lines_text = get_text(sorted_lines, '\n')
|
sorted_lines_text = get_text(sorted_lines, '\n')
|
||||||
|
|
||||||
if sorted_lines_text == region_text:
|
if sorted_lines_text == region_text:
|
||||||
LOG.info('Fixing line order of region "%s"', region.id)
|
LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id)
|
||||||
region.set_TextLine(sorted_lines)
|
region.set_TextLine(sorted_lines)
|
||||||
|
else:
|
||||||
|
LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
|
||||||
|
page_id, region.id,
|
||||||
|
str([line.id for line in lines]),
|
||||||
|
str([line.id for line in sorted_lines]),
|
||||||
|
lines_text, region_text)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue