diff --git a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py index d0d6117..41b3f7c 100644 --- a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py +++ b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py @@ -20,57 +20,13 @@ from .config import OCRD_TOOL TOOL = 'ocrd_repair_inconsistencies' LOG = getLogger('processor.RepairInconsistencies') + class RepairInconsistencies(Processor): def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] super(RepairInconsistencies, self).__init__(*args, **kwargs) - def _fix_lines(self, region): - """Fix line order in a region""" - - lines = region.get_TextLine() - region_text = get_text(region) - lines_text = get_text(lines, '\n') - if region_text != lines_text: - # XXX Assumes top-to-bottom - sorted_lines = sorted(lines, key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y) - sorted_lines_text = get_text(sorted_lines, '\n') - - if sorted_lines_text == region_text: - LOG.info('Fixing line order of region "%s"', region.id) - region.set_TextLine(sorted_lines) - - def _fix_words(self, line): - """Fix word order in a line""" - - words = line.get_Word() - line_text = get_text(line) - words_text = get_text(words, ' ') - if line_text != words_text: - # XXX Assumes left-to-right - sorted_words = sorted(words, key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x) - sorted_words_text = get_text(sorted_words, ' ') - - if sorted_words_text == line_text: - LOG.info('Fixing word order of line "%s"', line.id) - line.set_Word(sorted_words) - - def _fix_glyphs(self, word): - """Fix glyph order in a word""" - - glyphs = word.get_Glyph() - word_text = get_text(word) - glyphs_text = get_text(glyphs, '') - if word_text != glyphs_text: - # XXX Assumes left-to-right - sorted_glyphs = sorted(glyphs, key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x) - sorted_glyphs_text = get_text(sorted_glyphs, '') - - if sorted_glyphs_text == word_text: - LOG.info('Fixing glyph order of word "%s"', word.id) - word.set_Glyph(sorted_glyphs) - def process(self): for (n, input_file) in enumerate(self.input_files): page_id = input_file.pageId or input_file.ID @@ -78,18 +34,17 @@ class RepairInconsistencies(Processor): pcgts = page_from_file(self.workspace.download_file(input_file)) page = pcgts.get_Page() - regions = page.get_TextRegion() for region in regions: - self._fix_lines(region) + _fix_lines(region) lines = region.get_TextLine() for line in lines: - self._fix_words(line) + _fix_words(line) words = line.get_Word() for word in words: - self._fix_glyphs(word) + _fix_glyphs(word) file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) if file_id == input_file.ID: @@ -103,7 +58,6 @@ class RepairInconsistencies(Processor): content=to_xml(pcgts)) - def get_text(thing, joiner=None): """Get the text of the given thing, joining if necessary""" @@ -118,4 +72,52 @@ def get_text(thing, joiner=None): text = joiner.join(_get_text_for_one(t) for t in thing) else: text = _get_text_for_one(thing) - return text \ No newline at end of file + return text + + +def _fix_words(line): + """Fix word order in a line""" + + words = line.get_Word() + line_text = get_text(line) + words_text = get_text(words, ' ') + if line_text != words_text: + # XXX Assumes left-to-right + sorted_words = sorted(words, key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x) + sorted_words_text = get_text(sorted_words, ' ') + + if sorted_words_text == line_text: + LOG.info('Fixing word order of line "%s"', line.id) + line.set_Word(sorted_words) + + +def _fix_glyphs(word): + """Fix glyph order in a word""" + + glyphs = word.get_Glyph() + word_text = get_text(word) + glyphs_text = get_text(glyphs, '') + if word_text != glyphs_text: + # XXX Assumes left-to-right + sorted_glyphs = sorted(glyphs, key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x) + sorted_glyphs_text = get_text(sorted_glyphs, '') + + if sorted_glyphs_text == word_text: + LOG.info('Fixing glyph order of word "%s"', word.id) + word.set_Glyph(sorted_glyphs) + + +def _fix_lines(region): + """Fix line order in a region""" + + lines = region.get_TextLine() + region_text = get_text(region) + lines_text = get_text(lines, '\n') + if region_text != lines_text: + # XXX Assumes top-to-bottom + sorted_lines = sorted(lines, key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y) + sorted_lines_text = get_text(sorted_lines, '\n') + + if sorted_lines_text == region_text: + LOG.info('Fixing line order of region "%s"', region.id) + region.set_TextLine(sorted_lines)