Merge pull request #4 from bertsky/all-orders

All orders
pull/10/head
Mike Gerber 5 years ago committed by GitHub
commit 6ee105b17c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,7 +1,8 @@
import click import click
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_repair_inconsistencies.ocrd_repair_inconsistencies import RepairInconsistencies
from .ocrd_repair_inconsistencies import RepairInconsistencies
@click.command() @click.command()

@ -37,22 +37,51 @@ class RepairInconsistencies(Processor):
regions = page.get_TextRegion() regions = page.get_TextRegion()
for region in regions: for region in regions:
if region.readingDirection != 'left-to-right': textLineOrder = 'top-to-bottom'
LOG.info('Not processing region "%s" (not left-to-right)', region.id) for segment in [region, page]:
if segment.textLineOrder is None:
continue continue
if len(region.get_TextLine()) > 1 and region.textLineOrder != 'top-to-bottom': else:
LOG.info('Not processing region "%s" (not top-to-bottom)', region.id) textLineOrder = segment.textLineOrder
break
if textLineOrder not in ['top-to-bottom', 'bottom-to-top']:
LOG.info('Not processing page "%s" region "%s" (textLineOrder=%s)',
page_id, region.id, textLineOrder)
continue continue
_fix_lines(region) _fix_lines(region, page_id, reverse=textLineOrder=='bottom-to-top')
lines = region.get_TextLine() lines = region.get_TextLine()
for line in lines: for line in lines:
_fix_words(line) readingDirection = 'left-to-right'
for segment in [line, region, page]:
if segment.readingDirection is None:
continue
else:
readingDirection = segment.readingDirection
break
if readingDirection not in ['left-to-right', 'right-to-left']:
LOG.info('Not processing page "%s" line "%s" (readingDirection=%s)',
page_id, line.id, readingDirection)
continue
_fix_words(line, page_id, reverse=readingDirection=='right-to-left')
words = line.get_Word() words = line.get_Word()
for word in words: for word in words:
_fix_glyphs(word) readingDirection = 'left-to-right'
for segment in [word, line, region, page]:
if segment.readingDirection is None:
continue
else:
readingDirection = segment.readingDirection
break
if readingDirection not in ['left-to-right', 'right-to-left']:
LOG.info('Not processing page "%s" word "%s" (readingDirection=%s)',
page_id, word.id, readingDirection)
continue
_fix_glyphs(word, page_id, reverse=readingDirection=='right-to-left')
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
if file_id == input_file.ID: if file_id == input_file.ID:
@ -84,46 +113,73 @@ def get_text(thing, joiner=None):
return text return text
def _fix_words(line, page_id, reverse=False):
    """Fix word order in a line.

    If the text of ``line`` differs from the space-joined text of its words,
    re-sort the words by the x coordinate of their polygon centroids and
    adopt the new order, but only when that makes both texts equal.

    Args:
        line: text line element offering ``get_Word()``, ``set_Word()``
            and ``id``.
        page_id: identifier of the enclosing page; used for log messages only.
        reverse: sort by descending x instead of ascending x (the caller
            passes ``True`` for a ``right-to-left`` reading direction).

    Returns:
        None. ``line`` is modified in place when re-sorting helps.
    """
    words = line.get_Word()
    if not words:
        # Nothing to sort on this level.
        return

    line_text = get_text(line)
    words_text = get_text(words, ' ')
    if line_text == words_text:
        # Line text and word texts already agree.
        return

    sorted_words = sorted(
        words, reverse=reverse,
        key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x)
    sorted_words_text = get_text(sorted_words, ' ')

    if sorted_words_text == line_text:
        LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id)
        line.set_Word(sorted_words)
    else:
        # Keep the original order: re-sorting alone does not explain the
        # mismatch, so the inconsistency has some other cause.
        LOG.debug('Resorting words of page "%s" line "%s" from %s to %s does not suffice to turn "%s" into "%s"',
                  page_id, line.id,
                  str([word.id for word in words]),
                  str([word.id for word in sorted_words]),
                  words_text, line_text)
def _fix_glyphs(word, page_id, reverse=False):
    """Fix glyph order in a word.

    When the text of ``word`` differs from the concatenated text of its
    glyphs, the glyphs are ordered by the x coordinate of their polygon
    centroids (descending if ``reverse`` is true). The new order is stored
    on ``word`` only if it makes both texts equal; otherwise the mismatch is
    reported at debug level and ``word`` is left untouched.

    ``page_id`` is used for log messages only.
    """
    def centroid_x(glyph):
        # Horizontal centre of the glyph's coordinate polygon.
        outline = polygon_from_points(glyph.get_Coords().points)
        return Polygon(outline).centroid.x

    current = word.get_Glyph()
    if not current:
        return

    expected_text = get_text(word)
    current_text = get_text(current, '')
    if expected_text == current_text:
        return

    reordered = sorted(current, reverse=reverse, key=centroid_x)
    reordered_text = get_text(reordered, '')
    if reordered_text != expected_text:
        LOG.debug('Resorting glyphs of page "%s" word "%s" from %s to %s does not suffice to turn "%s" into "%s"',
                  page_id, word.id,
                  str([g.id for g in current]),
                  str([g.id for g in reordered]),
                  current_text, expected_text)
        return

    LOG.info('Fixing glyph order of page "%s" word "%s"', page_id, word.id)
    word.set_Glyph(reordered)
def _fix_lines(region, page_id, reverse=False):
    """Fix line order in a region.

    When the text of ``region`` differs from the newline-joined text of its
    lines, the lines are ordered by the y coordinate of their polygon
    centroids (descending if ``reverse`` is true). The new order is stored
    on ``region`` only if it makes both texts equal; otherwise the mismatch
    is reported at debug level and ``region`` is left untouched.

    ``page_id`` is used for log messages only.
    """
    def centroid_y(text_line):
        # Vertical centre of the line's coordinate polygon.
        outline = polygon_from_points(text_line.get_Coords().points)
        return Polygon(outline).centroid.y

    current = region.get_TextLine()
    if not current:
        return

    expected_text = get_text(region)
    current_text = get_text(current, '\n')
    if expected_text == current_text:
        return

    reordered = sorted(current, reverse=reverse, key=centroid_y)
    reordered_text = get_text(reordered, '\n')
    if reordered_text != expected_text:
        LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
                  page_id, region.id,
                  str([ln.id for ln in current]),
                  str([ln.id for ln in reordered]),
                  current_text, expected_text)
        return

    LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id)
    region.set_TextLine(reordered)

Loading…
Cancel
Save