def _fix_lines(self, region):
    """Repair the order of the text lines stored in a region.

    The text obtained for the region itself is compared with the text
    obtained for its lines joined by newlines. When the two differ, the
    lines are re-sorted by the y coordinate of the centroid of each line's
    coordinate polygon. The re-sorted list is written back to the region
    only if it makes the joined line text equal to the region text;
    otherwise the region is left unchanged.

    :param region: region object providing ``get_TextLine()``,
        ``set_TextLine()`` and ``id``.
    :return: None; the region is modified in place when a fix applies.
    """
    text_lines = region.get_TextLine()
    expected_text = get_text(region)
    current_text = get_text(text_lines, '\n')

    # Lines already agree with the region-level text: nothing to repair.
    if not (expected_text != current_text):
        return

    def centroid_y(text_line):
        # Vertical position of the line, taken from its outline polygon.
        outline = Polygon(polygon_from_points(text_line.get_Coords().points))
        return outline.centroid.y

    # XXX Assumes top-to-bottom
    reordered = sorted(text_lines, key=centroid_y)
    reordered_text = get_text(reordered, '\n')

    # Only accept the new order if it actually reproduces the region text.
    if not (reordered_text == expected_text):
        return

    LOG.info('Fixing line order of region "%s"', region.id)
    region.set_TextLine(reordered)