From 295165692b8d24bc4160da6bea670ef7b3db05c8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 29 Nov 2019 12:16:04 +0100 Subject: [PATCH 1/2] fix module tree --- ocrd_repair_inconsistencies/__init__.py | 0 ocrd_repair_inconsistencies/cli.py | 3 ++- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 ocrd_repair_inconsistencies/__init__.py diff --git a/ocrd_repair_inconsistencies/__init__.py b/ocrd_repair_inconsistencies/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ocrd_repair_inconsistencies/cli.py b/ocrd_repair_inconsistencies/cli.py index b684068..b2513fe 100644 --- a/ocrd_repair_inconsistencies/cli.py +++ b/ocrd_repair_inconsistencies/cli.py @@ -1,7 +1,8 @@ import click from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_repair_inconsistencies.ocrd_repair_inconsistencies import RepairInconsistencies + +from .ocrd_repair_inconsistencies import RepairInconsistencies @click.command() From 0dc5bdac2e034eefe893503b6e2dc2578ac93e74 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 29 Nov 2019 12:31:08 +0100 Subject: [PATCH 2/2] generalize to other textLineOrder/readingDirection: - don't ignore regions / lines / words that are not top-to-bottom and left-to-right; instead, only ignore regions that are not top-to-bottom OR bottom-to-top and lines or words that are not left-to-right OR right-to-left (thus, applying each on its appropriate level, and allowing reverse sorting, but still discounting rotated layouts) - don't enter segments if they have no more than 1 child - improve logging: show failed attempts on debug, show pageIds throughout --- .../ocrd_repair_inconsistencies.py | 90 +++++++++++++++---- 1 file changed, 73 insertions(+), 17 deletions(-) diff --git a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py index 78a98e4..282a070 100644 --- a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py +++ b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py @@ -37,22 +37,51 @@ class RepairInconsistencies(Processor): regions = page.get_TextRegion() for region in regions: - if region.readingDirection != 'left-to-right': - LOG.info('Not processing region "%s" (not left-to-right)', region.id) - continue - if len(region.get_TextLine()) > 1 and region.textLineOrder != 'top-to-bottom': - LOG.info('Not processing region "%s" (not top-to-bottom)', region.id) + textLineOrder = 'top-to-bottom' + for segment in [region, page]: + if segment.textLineOrder is None: + continue + else: + textLineOrder = segment.textLineOrder + break + if textLineOrder not in ['top-to-bottom', 'bottom-to-top']: + LOG.info('Not processing page "%s" region "%s" (textLineOrder=%s)', + page_id, region.id, textLineOrder) continue - _fix_lines(region) + _fix_lines(region, page_id, reverse=textLineOrder=='bottom-to-top') lines = region.get_TextLine() for line in lines: - _fix_words(line) + readingDirection = 'left-to-right' + for segment in [line, region, page]: + if segment.readingDirection is None: + continue + else: + readingDirection = segment.readingDirection + break + if readingDirection not in ['left-to-right', 'right-to-left']: + LOG.info('Not processing page "%s" line "%s" (readingDirection=%s)', + page_id, line.id, readingDirection) + continue + + _fix_words(line, page_id, reverse=readingDirection=='right-to-left') words = line.get_Word() for word in words: - _fix_glyphs(word) + readingDirection = 'left-to-right' + for segment in [word, line, region, page]: + if segment.readingDirection is None: + continue + else: + readingDirection = segment.readingDirection + break + if readingDirection not in ['left-to-right', 'right-to-left']: + LOG.info('Not processing page "%s" word "%s" (readingDirection=%s)', + page_id, word.id, readingDirection) + continue + + _fix_glyphs(word, page_id, reverse=readingDirection=='right-to-left') file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) if file_id == input_file.ID: @@ -84,46 +113,73 @@ def get_text(thing, joiner=None): return text -def _fix_words(line): +def _fix_words(line, page_id, reverse=False): """Fix word order in a line""" words = line.get_Word() + if not words: + return line_text = get_text(line) words_text = get_text(words, ' ') if line_text != words_text: - sorted_words = sorted(words, key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x) + sorted_words = sorted(words, reverse=reverse, + key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x) sorted_words_text = get_text(sorted_words, ' ') if sorted_words_text == line_text: - LOG.info('Fixing word order of line "%s"', line.id) + LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id) line.set_Word(sorted_words) + else: + LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"', + page_id, line.id, + str([word.id for word in words]), + str([word.id for word in sorted_words]), + words_text, line_text) -def _fix_glyphs(word): +def _fix_glyphs(word, page_id, reverse=False): """Fix glyph order in a word""" glyphs = word.get_Glyph() + if not glyphs: + return word_text = get_text(word) glyphs_text = get_text(glyphs, '') if word_text != glyphs_text: - sorted_glyphs = sorted(glyphs, key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x) + sorted_glyphs = sorted(glyphs, reverse=reverse, + key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x) sorted_glyphs_text = get_text(sorted_glyphs, '') if sorted_glyphs_text == word_text: - LOG.info('Fixing glyph order of word "%s"', word.id) + LOG.info('Fixing glyph order of page "%s" word "%s"', page_id, word.id) word.set_Glyph(sorted_glyphs) + else: + LOG.debug('Resorting glyphs of page "%s" word "%s" from %s to %s does not suffice to turn "%s" into "%s"', + page_id, word.id, + str([glyph.id for glyph in glyphs]), + str([glyph.id for glyph in sorted_glyphs]), + glyphs_text, word_text) -def _fix_lines(region): +def _fix_lines(region, page_id, reverse=False): """Fix line order in a region""" lines = region.get_TextLine() + if not lines: + return region_text = get_text(region) lines_text = get_text(lines, '\n') if region_text != lines_text: - sorted_lines = sorted(lines, key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y) + sorted_lines = sorted(lines, reverse=reverse, + key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y) sorted_lines_text = get_text(sorted_lines, '\n') if sorted_lines_text == region_text: - LOG.info('Fixing line order of region "%s"', region.id) + LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id) region.set_TextLine(sorted_lines) + else: + LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"', + page_id, region.id, + str([line.id for line in lines]), + str([line.id for line in sorted_lines]), + lines_text, region_text)