Merge pull request #6 from bertsky/relax-concat-test

Relax concatenation test, back out gracefully
pull/10/head
Mike Gerber 5 years ago committed by GitHub
commit 594d937ef5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

1
.gitignore vendored

@ -104,5 +104,6 @@ venv.bak/
.mypy_cache/
# vim tmp
*~
*.swp
*.swo

@ -0,0 +1,24 @@
[MASTER]
[MESSAGES CONTROL]
disable =
ungrouped-imports,
bad-continuation,
missing-docstring,
no-self-use,
superfluous-parens,
invalid-name,
line-too-long,
too-many-arguments,
too-many-branches,
too-many-statements,
too-many-locals,
too-few-public-methods,
wrong-import-order,
duplicate-code
# allow indented whitespace (as required by interpreter):
no-space-check=empty-line
# allow non-snake-case identifiers:
good-names=n,i

@ -0,0 +1,21 @@
SHELL = /bin/bash
PYTHON = python3
PIP = pip3
define HELP
cat <<EOF
ocrd_repair_inconsistencies
Targets:
deps Install Python dependencies via pip
install Install Python package
EOF
endef
export HELP
help: ; @eval "$$HELP"
deps:
$(PIP) install -r requirements.txt
install:
$(PIP) install .

@ -1,20 +1,49 @@
# ocrd_repair_inconsistencies
Automatically re-order lines, words and glyphs to become textually consistent with their parents.
Automatically re-order lines, words and glyphs to become textually consistent with their parents.
## Introduction
PAGE-XML elements with textual annotation are re-ordered by their centroid coordinates
in top-to-bottom/left-to-right fashion iff such re-ordering fixes the inconsistency
between their appropriately concatenated `TextEquiv` texts with their parent's `TextEquiv` text.
iff such re-ordering fixes the inconsistency between their appropriately concatenated
`TextEquiv` texts and their parent's `TextEquiv` text.
If `TextEquiv` is missing, skip the respective elements.
Where available, respect the annotated visual order:
- For regions vs lines, sort in `top-to-bottom` fashion, unless another `textLineOrder` is annotated.
(Both `left-to-right` and `right-to-left` will be skipped currently.)
- For lines vs words and words vs glyphs, sort in `left-to-right` fashion, unless another `readingDirection` is annotated.
(Both `top-to-bottom` and `bottom-to-top` will be skipped currently.)
This processor does not affect `ReadingOrder` between regions, just the order of the XML elements
below the region level, and only if not contradicting the annotated `textLineOrder`/`readingDirection`.
We wrote this as a one-shot script to fix some files. Use with caution.
## Installation
(In your venv, run:)
```sh
make deps # or pip install -r requirements.txt
make install # or pip install .
```
## Usage
Offers the following user interfaces:
### [OCR-D processor](https://ocr-d.github.io/cli) CLI `ocrd-repair-inconsistencies`
To be used with [PageXML](https://github.com/PRImA-Research-Lab/PAGE-XML)
documents in an [OCR-D](https://ocr-d.github.io) annotation workflow.
### Example
## Example usage
Use the following script to repair `OCR-D-GT-PAGE` annotation in workspaces,
and then replace it with the output on success:
For example, use this fix script:
~~~sh
#!/bin/bash
set -e

@ -1,4 +1,4 @@
import json
from pkg_resources import resource_string
OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))

@ -6,6 +6,8 @@ from collections import Sequence
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
TextRegionType, TextLineType, WordType,
MetadataItemType, LabelsType, LabelType,
to_xml
)
from ocrd_utils import (
@ -33,7 +35,20 @@ class RepairInconsistencies(Processor):
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
page = pcgts.get_Page()
# add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))
regions = page.get_TextRegion()
for region in regions:
@ -49,7 +64,7 @@ class RepairInconsistencies(Processor):
page_id, region.id, textLineOrder)
continue
_fix_lines(region, page_id, reverse=textLineOrder=='bottom-to-top')
_fix_segment(region, page_id, reverse=(textLineOrder == 'bottom-to-top'))
lines = region.get_TextLine()
for line in lines:
@ -65,7 +80,7 @@ class RepairInconsistencies(Processor):
page_id, line.id, readingDirection)
continue
_fix_words(line, page_id, reverse=readingDirection=='right-to-left')
_fix_segment(line, page_id, reverse=(readingDirection == 'right-to-left'))
words = line.get_Word()
for word in words:
@ -81,7 +96,7 @@ class RepairInconsistencies(Processor):
page_id, word.id, readingDirection)
continue
_fix_glyphs(word, page_id, reverse=readingDirection=='right-to-left')
_fix_segment(word, page_id, reverse=(readingDirection == 'right-to-left'))
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
if file_id == input_file.ID:
@ -95,91 +110,66 @@ class RepairInconsistencies(Processor):
content=to_xml(pcgts))
def get_text(thing, joiner=None):
def get_text(thing, joiner=''):
"""Get the text of the given thing, joining if necessary"""
def _get_text_for_one(t):
if len(t.get_TextEquiv()) != 1:
raise NotImplementedError
def _get_text_for_one(one):
try:
return t.get_TextEquiv()[0].get_Unicode()
return one.get_TextEquiv()[0].get_Unicode()
except Exception:
LOG.warning('element "%s" has no text', one.id)
return None
if isinstance(thing, Sequence):
text = joiner.join(_get_text_for_one(t) for t in thing)
texts = [_get_text_for_one(part) for part in thing]
if all(texts):
return joiner.join(texts)
return None
return _get_text_for_one(thing)
def _fix_segment(segment, page_id, reverse=False):
"""Fix order of child elements of (region/line/word) segment."""
if isinstance(segment, TextRegionType):
joiner = '\n'
sort_horizontal = False
children = segment.get_TextLine()
adoption = segment.set_TextLine
elif isinstance(segment, TextLineType):
joiner = ' '
sort_horizontal = True
children = segment.get_Word()
adoption = segment.set_Word
elif isinstance(segment, WordType):
joiner = ''
sort_horizontal = True
children = segment.get_Glyph()
adoption = segment.set_Glyph
else:
text = _get_text_for_one(thing)
return text
def _fix_words(line, page_id, reverse=False):
"""Fix word order in a line"""
words = line.get_Word()
if not words:
return
line_text = get_text(line)
words_text = get_text(words, ' ')
if line_text != words_text:
sorted_words = sorted(words, reverse=reverse,
key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x)
sorted_words_text = get_text(sorted_words, ' ')
if sorted_words_text == line_text:
LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id)
line.set_Word(sorted_words)
else:
LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
page_id, line.id,
str([word.id for word in words]),
str([word.id for word in sorted_words]),
words_text, line_text)
def _fix_glyphs(word, page_id, reverse=False):
"""Fix glyph order in a word"""
glyphs = word.get_Glyph()
if not glyphs:
return
word_text = get_text(word)
glyphs_text = get_text(glyphs, '')
if word_text != glyphs_text:
sorted_glyphs = sorted(glyphs, reverse=reverse,
key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x)
sorted_glyphs_text = get_text(sorted_glyphs, '')
if sorted_glyphs_text == word_text:
LOG.info('Fixing glyph order of page "%s" word "%s"', page_id, word.id)
word.set_Glyph(sorted_glyphs)
else:
LOG.debug('Resorting glyphs of page "%s" word "%s" from %s to %s does not suffice to turn "%s" into "%s"',
page_id, word.id,
str([glyph.id for glyph in glyphs]),
str([glyph.id for glyph in sorted_glyphs]),
glyphs_text, word_text)
def _fix_lines(region, page_id, reverse=False):
"""Fix line order in a region"""
lines = region.get_TextLine()
if not lines:
raise Exception('invalid element type %s of segment to fix' % type(segment))
if not children:
return
region_text = get_text(region)
lines_text = get_text(lines, '\n')
if region_text != lines_text:
sorted_lines = sorted(lines, reverse=reverse,
key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y)
sorted_lines_text = get_text(sorted_lines, '\n')
if sorted_lines_text == region_text:
LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id)
region.set_TextLine(sorted_lines)
segment_text = get_text(segment)
concat_text = get_text(children, joiner)
if (segment_text and concat_text and
segment_text != concat_text and
segment_text.replace(joiner, '') != concat_text.replace(joiner, '')):
def polygon_position(child, horizontal=sort_horizontal):
polygon = Polygon(polygon_from_points(child.get_Coords().points))
if horizontal:
return polygon.centroid.x
return polygon.centroid.y
sorted_children = sorted(children, reverse=reverse, key=polygon_position)
sorted_concat_text = get_text(sorted_children, joiner)
if (segment_text == sorted_concat_text or
segment_text.replace(joiner, '') == sorted_concat_text.replace(joiner, '')):
LOG.info('Fixing element order of page "%s" segment "%s"', page_id, segment.id)
adoption(sorted_children)
else:
LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
page_id, region.id,
str([line.id for line in lines]),
str([line.id for line in sorted_lines]),
lines_text, region_text)
LOG.debug('Resorting children of page "%s" segment "%s" from %s to %s' +
'does not suffice to turn "%s" into "%s"',
page_id, segment.id,
str([seg.id for seg in children]),
str([seg.id for seg in sorted_children]),
concat_text, segment_text)

@ -6,7 +6,8 @@ from setuptools import setup, find_packages
setup(
name='ocrd_repair_inconsistencies',
description='Repair glyph/word/line order inconsistencies',
#long_description=codecs.open('README.md', encoding='utf-8').read(),
long_description=codecs.open('README.md', encoding='utf-8').read(),
long_description_content_type='text/markdown',
author='Mike Gerber',
author_email='mike.gerber@sbb.spk-berlin.de',
license='Apache License 2.0',

Loading…
Cancel
Save