🎨 Extract methods to fix lines + words

2026-03-15 03:31:58 +01:00 · 2019-11-22 16:39:23 +01:00 · 2019-11-22 16:39:23 +01:00 · c7033a5d4d
commit c7033a5d4d
parent 8d21cd8ab9
1 changed files with 33 additions and 26 deletions
--- a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py
+++ b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py
@ -42,20 +42,9 @@ class RepairInconsistencies(Processor):
        kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
        super(RepairInconsistencies, self).__init__(*args, **kwargs)

-    def process(self):
-        for (n, input_file) in enumerate(self.input_files):
-            page_id = input_file.pageId or input_file.ID
-            LOG.info("INPUT FILE %i / %s", n, page_id)
-            pcgts = page_from_file(self.workspace.download_file(input_file))
-            page = pcgts.get_Page()
+    def _fix_words(self, line):
+        """Fix word order in a line"""

-            regions = page.get_TextRegion()
-            for region in regions:
-
-                lines = region.get_TextLine()
-                for line in lines:
-
-                    # Fix words in lines
        words = line.get_Word()
        line_text = get_text(line)
        words_text = get_text(words, ' ')
@ -68,22 +57,40 @@ class RepairInconsistencies(Processor):
                LOG.info('Fixing word order of line "%s"', line.id)
                line.set_Word(sorted_words)

-                    words = line.get_Word()
-                    for word in words:
+    def _fix_glyphs(self, word):
+        """Fix glyph order in a word"""

-                        # Fix glyphs in words
        glyphs = word.get_Glyph()
        word_text = get_text(word)
        glyphs_text = get_text(glyphs, '')
        if word_text != glyphs_text:
            # XXX Assumes left-to-right
-                            sorted_glyphs = sorted(glyphs, key=lambda g: Polygon( polygon_from_points(g.get_Coords().points)).centroid.x)
+            sorted_glyphs = sorted(glyphs, key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x)
            sorted_glyphs_text = get_text(sorted_glyphs, '')

            if sorted_glyphs_text == word_text:
                LOG.info('Fixing glyph order of word "%s"', word.id)
                word.set_Glyph(sorted_glyphs)

+    def process(self):
+        for (n, input_file) in enumerate(self.input_files):
+            page_id = input_file.pageId or input_file.ID
+            LOG.info("INPUT FILE %i / %s", n, page_id)
+            pcgts = page_from_file(self.workspace.download_file(input_file))
+            page = pcgts.get_Page()
+
+
+            regions = page.get_TextRegion()
+            for region in regions:
+
+                lines = region.get_TextLine()
+                for line in lines:
+                    self._fix_words(line)
+
+                    words = line.get_Word()
+                    for word in words:
+                        self._fix_glyphs(word)
+
            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)