From 9002606e1c119d0fd55ea79ea4a0c43f17d4303f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 29 Nov 2019 17:46:16 +0100 Subject: [PATCH] unify function for 3 levels --- .../ocrd_repair_inconsistencies.py | 120 +++++++----------- 1 file changed, 47 insertions(+), 73 deletions(-) diff --git a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py index 50b26ef..bca6888 100644 --- a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py +++ b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py @@ -6,6 +6,7 @@ from collections import Sequence from ocrd import Processor from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( + TextRegionType, TextLineType, WordType, to_xml ) from ocrd_utils import ( @@ -49,7 +50,7 @@ class RepairInconsistencies(Processor): page_id, region.id, textLineOrder) continue - _fix_lines(region, page_id, reverse=textLineOrder=='bottom-to-top') + _fix_segment(region, page_id, reverse=textLineOrder=='bottom-to-top') lines = region.get_TextLine() for line in lines: @@ -65,7 +66,7 @@ class RepairInconsistencies(Processor): page_id, line.id, readingDirection) continue - _fix_words(line, page_id, reverse=readingDirection=='right-to-left') + _fix_segment(line, page_id, reverse=readingDirection=='right-to-left') words = line.get_Word() for word in words: @@ -81,7 +82,7 @@ class RepairInconsistencies(Processor): page_id, word.id, readingDirection) continue - _fix_glyphs(word, page_id, reverse=readingDirection=='right-to-left') + _fix_segment(word, page_id, reverse=readingDirection=='right-to-left') file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) if file_id == input_file.ID: @@ -113,75 +114,48 @@ def get_text(thing, joiner=None): return text -def _fix_words(line, page_id, reverse=False): - """Fix word order in a line""" - - words = line.get_Word() - if not words: - return - line_text = get_text(line) - words_text = get_text(words, ' ') - if line_text != words_text: - sorted_words = sorted(words, reverse=reverse, - key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x) - sorted_words_text = get_text(sorted_words, ' ') - - if (sorted_words_text == line_text or - sorted_words_text.replace(' ', '') == line_text.replace(' ', '')): - LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id) - line.set_Word(sorted_words) - else: - LOG.debug('Resorting lines of page "%s" line "%s" from %s to %s does not suffice to turn "%s" into "%s"', - page_id, line.id, - str([word.id for word in words]), - str([word.id for word in sorted_words]), - words_text, line_text) - - -def _fix_glyphs(word, page_id, reverse=False): - """Fix glyph order in a word""" - - glyphs = word.get_Glyph() - if not glyphs: - return - word_text = get_text(word) - glyphs_text = get_text(glyphs, '') - if word_text != glyphs_text: - sorted_glyphs = sorted(glyphs, reverse=reverse, - key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x) - sorted_glyphs_text = get_text(sorted_glyphs, '') - - if sorted_glyphs_text == word_text: - LOG.info('Fixing glyph order of page "%s" word "%s"', page_id, word.id) - word.set_Glyph(sorted_glyphs) - else: - LOG.debug('Resorting glyphs of page "%s" word "%s" from %s to %s does not suffice to turn "%s" into "%s"', - page_id, word.id, - str([glyph.id for glyph in glyphs]), - str([glyph.id for glyph in sorted_glyphs]), - glyphs_text, word_text) - - -def _fix_lines(region, page_id, reverse=False): - """Fix line order in a region""" - - lines = region.get_TextLine() - if not lines: +def _fix_segment(segment, page_id, reverse=False): + """Fix order of child elements of (region/line/word) segment.""" + + if isinstance(segment, TextRegionType): + joiner = '\n' + sort_horizontal = False + children = segment.get_TextLine() + adoption = segment.set_TextLine + elif isinstance(segment, TextLineType): + joiner = ' ' + sort_horizontal = True + children = segment.get_Word() + adoption = segment.set_Word + elif isinstance(segment, WordType): + joiner = '' + sort_horizontal = True + children = segment.get_Glyph() + adoption = segment.set_Glyph + else: + raise Exception('invalid element type %s of segment to fix' % type(segment)) + if not children: return - region_text = get_text(region) - lines_text = get_text(lines, '\n') - if region_text != lines_text: - sorted_lines = sorted(lines, reverse=reverse, - key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y) - sorted_lines_text = get_text(sorted_lines, '\n') - - if (sorted_lines_text == region_text or - sorted_lines_text.replace('\n', '') == region_text.replace('\n', '')): - LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id) - region.set_TextLine(sorted_lines) + segment_text = get_text(segment) + concat_text = get_text(children, joiner) + if (segment_text != concat_text and + segment_text.replace(joiner, '') != concat_text.replace(joiner, '')): + def polygon_position(child, horizontal=sort_horizontal): + polygon = Polygon(polygon_from_points(child.get_Coords().points)) + if horizontal: + return polygon.centroid.x + else: + return polygon.centroid.y + sorted_children = sorted(children, reverse=reverse, key=polygon_position) + sorted_concat_text = get_text(sorted_children, joiner) + + if (segment_text == sorted_concat_text or + segment_text.replace(joiner, '') == sorted_concat_text.replace(joiner, '')): + LOG.info('Fixing element order of page "%s" segment "%s"', page_id, segment.id) + adoption(sorted_children) else: - LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"', - page_id, region.id, - str([line.id for line in lines]), - str([line.id for line in sorted_lines]), - lines_text, region_text) + LOG.debug('Resorting children of page "%s" segment "%s" from %s to %s does not suffice to turn "%s" into "%s"', + page_id, segment.id, + str([seg.id for seg in children]), + str([seg.id for seg in sorted_children]), + concat_text, segment_text)