Mirror of https://github.com/qurator-spk/ocrd_repair_inconsistencies.git
(synced 2025-06-26 12:09:56 +02:00)

Merge pull request #6 from bertsky/relax-concat-test

Relax concatenation test, backout gracefully

commit 594d937ef5
7 changed files with 156 additions and 90 deletions
.gitignore (vendored): 1 line added

@@ -104,5 +104,6 @@ venv.bak/
 .mypy_cache/

 # vim tmp
+*~
 *.swp
 *.swo
.pylintrc (new file): 24 lines added

@@ -0,0 +1,24 @@
+[MASTER]
+
+[MESSAGES CONTROL]
+disable =
+    ungrouped-imports,
+    bad-continuation,
+    missing-docstring,
+    no-self-use,
+    superfluous-parens,
+    invalid-name,
+    line-too-long,
+    too-many-arguments,
+    too-many-branches,
+    too-many-statements,
+    too-many-locals,
+    too-few-public-methods,
+    wrong-import-order,
+    duplicate-code
+
+# allow indented whitespace (as required by interpreter):
+no-space-check=empty-line
+
+# allow non-snake-case identifiers:
+good-names=n,i
Makefile (new file): 21 lines added

@@ -0,0 +1,21 @@
+SHELL = /bin/bash
+PYTHON = python3
+PIP = pip3
+
+define HELP
+cat <<EOF
+ocrd_repair_inconsistencies
+
+Targets:
+	deps     Install Python dependencies via pip
+	install  Install Python package
+EOF
+endef
+export HELP
+help: ; @eval "$$HELP"
+
+deps:
+	$(PIP) install -r requirements.txt
+
+install:
+	$(PIP) install .
README.md: 37 lines changed

@@ -2,19 +2,48 @@

 Automatically re-order lines, words and glyphs to become textually consistent with their parents.

+## Introduction
+
 PAGE-XML elements with textual annotation are re-ordered by their centroid coordinates
-in top-to-bottom/left-to-right fashion iff such re-ordering fixes the inconsistency
-between their appropriately concatenated `TextEquiv` texts with their parent's `TextEquiv` text.
+iff such re-ordering fixes the inconsistency between their appropriately concatenated
+`TextEquiv` texts with their parent's `TextEquiv` text.
+
+If `TextEquiv` is missing, skip the respective elements.
+
+Where available, respect the annotated visual order:
+- For regions vs lines, sort in `top-to-bottom` fashion, unless another `textLineOrder` is annotated.
+  (Both `left-to-right` and `right-to-left` will be skipped currently.)
+- For lines vs words and words vs glyphs, sort in `left-to-right` fashion, unless another `readingDirection` is annotated.
+  (Both `top-to-bottom` and `bottom-to-top` will be skipped currently.)

 This processor does not affect `ReadingOrder` between regions, just the order of the XML elements
 below the region level, and only if not contradicting the annotated `textLineOrder`/`readingDirection`.

 We wrote this as a one-shot script to fix some files. Use with caution.

-## Example usage
+## Installation
+
+(In your venv, run:)
+
+```sh
+make deps # or pip install -r requirements.txt
+make install # or pip install .
+```
+
+## Usage
+
+Offers the following user interfaces:
+
+### [OCR-D processor](https://ocr-d.github.io/cli) CLI `ocrd-repair-inconsistencies`
+
+To be used with [PageXML](https://github.com/PRImA-Research-Lab/PAGE-XML)
+documents in an [OCR-D](https://ocr-d.github.io) annotation workflow.
+
+### Example
+
+Use the following script to repair `OCR-D-GT-PAGE` annotation in workspaces,
+and then replace it with the output on success:

-For example, use this fix script:

 ~~~sh
 #!/bin/bash
 set -e
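The re-ordering criterion described in the README (sort the children by centroid coordinate, and keep the new order only iff it repairs the mismatch with the parent text) can be sketched independently of the PAGE-XML API. The following is a minimal illustration, assuming shapely for the centroid computation; the `text` and `polygon` attributes on the children are hypothetical stand-ins for the real accessors, not the processor's interface:

```python
# Minimal sketch of the centroid-based re-ordering criterion (illustrative only).
from shapely.geometry import Polygon


def reorder_if_consistent(parent_text, children, joiner=' ', horizontal=True):
    """Return children sorted by centroid iff that makes their joined text
    match the parent's text; otherwise return them unchanged."""
    if parent_text == joiner.join(c.text for c in children):
        return children  # already consistent, nothing to do

    def position(child):
        # centroid of the child's polygon; x for left-to-right, y for top-to-bottom
        centroid = Polygon(child.polygon).centroid
        return centroid.x if horizontal else centroid.y

    resorted = sorted(children, key=position)
    if parent_text == joiner.join(c.text for c in resorted):
        return resorted  # re-ordering alone fixes the inconsistency
    return children      # back out: sorting does not explain the mismatch
```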
@@ -6,6 +6,8 @@ from collections import Sequence
 from ocrd import Processor
 from ocrd_modelfactory import page_from_file
 from ocrd_models.ocrd_page import (
+    TextRegionType, TextLineType, WordType,
+    MetadataItemType, LabelsType, LabelType,
     to_xml
 )
 from ocrd_utils import (
@@ -34,6 +36,19 @@ class RepairInconsistencies(Processor):
             pcgts = page_from_file(self.workspace.download_file(input_file))
             page = pcgts.get_Page()

+            # add metadata about this operation and its runtime parameters:
+            metadata = pcgts.get_Metadata() # ensured by from_file()
+            metadata.add_MetadataItem(
+                MetadataItemType(type_="processingStep",
+                                 name=self.ocrd_tool['steps'][0],
+                                 value=TOOL,
+                                 Labels=[LabelsType(
+                                     externalModel="ocrd-tool",
+                                     externalId="parameters",
+                                     Label=[LabelType(type_=name,
+                                                      value=self.parameter[name])
+                                            for name in self.parameter.keys()])]))
+
             regions = page.get_TextRegion()

             for region in regions:
@@ -49,7 +64,7 @@ class RepairInconsistencies(Processor):
                              page_id, region.id, textLineOrder)
                    continue

-                _fix_lines(region, page_id, reverse=textLineOrder=='bottom-to-top')
+                _fix_segment(region, page_id, reverse=(textLineOrder == 'bottom-to-top'))

                lines = region.get_TextLine()
                for line in lines:
@@ -65,7 +80,7 @@ class RepairInconsistencies(Processor):
                                  page_id, line.id, readingDirection)
                        continue

-                    _fix_words(line, page_id, reverse=readingDirection=='right-to-left')
+                    _fix_segment(line, page_id, reverse=(readingDirection == 'right-to-left'))

                    words = line.get_Word()
                    for word in words:
@@ -81,7 +96,7 @@ class RepairInconsistencies(Processor):
                                      page_id, word.id, readingDirection)
                            continue

-                        _fix_glyphs(word, page_id, reverse=readingDirection=='right-to-left')
+                        _fix_segment(word, page_id, reverse=(readingDirection == 'right-to-left'))

            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
            if file_id == input_file.ID:
@@ -95,91 +110,66 @@
                 content=to_xml(pcgts))


-def get_text(thing, joiner=None):
+def get_text(thing, joiner=''):
     """Get the text of the given thing, joining if necessary"""

-    def _get_text_for_one(t):
-        if len(t.get_TextEquiv()) != 1:
-            raise NotImplementedError
+    def _get_text_for_one(one):
         try:
-            return t.get_TextEquiv()[0].get_Unicode()
+            return one.get_TextEquiv()[0].get_Unicode()
         except Exception:
+            LOG.warning('element "%s" has no text', one.id)
             return None

     if isinstance(thing, Sequence):
-        text = joiner.join(_get_text_for_one(t) for t in thing)
+        texts = [_get_text_for_one(part) for part in thing]
+        if all(texts):
+            return joiner.join(texts)
+        return None
+    return _get_text_for_one(thing)
+
+
+def _fix_segment(segment, page_id, reverse=False):
+    """Fix order of child elements of (region/line/word) segment."""
+    if isinstance(segment, TextRegionType):
+        joiner = '\n'
+        sort_horizontal = False
+        children = segment.get_TextLine()
+        adoption = segment.set_TextLine
+    elif isinstance(segment, TextLineType):
+        joiner = ' '
+        sort_horizontal = True
+        children = segment.get_Word()
+        adoption = segment.set_Word
+    elif isinstance(segment, WordType):
+        joiner = ''
+        sort_horizontal = True
+        children = segment.get_Glyph()
+        adoption = segment.set_Glyph
     else:
-        text = _get_text_for_one(thing)
-    return text
-
-
-def _fix_words(line, page_id, reverse=False):
-    """Fix word order in a line"""
-
-    words = line.get_Word()
-    if not words:
+        raise Exception('invalid element type %s of segment to fix' % type(segment))
+    if not children:
         return
-    line_text = get_text(line)
-    words_text = get_text(words, ' ')
-    if line_text != words_text:
-        sorted_words = sorted(words, reverse=reverse,
-                              key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x)
-        sorted_words_text = get_text(sorted_words, ' ')
+    segment_text = get_text(segment)
+    concat_text = get_text(children, joiner)
+    if (segment_text and concat_text and
+            segment_text != concat_text and
+            segment_text.replace(joiner, '') != concat_text.replace(joiner, '')):
+        def polygon_position(child, horizontal=sort_horizontal):
+            polygon = Polygon(polygon_from_points(child.get_Coords().points))
+            if horizontal:
+                return polygon.centroid.x
+            return polygon.centroid.y
+        sorted_children = sorted(children, reverse=reverse, key=polygon_position)
+        sorted_concat_text = get_text(sorted_children, joiner)

-        if sorted_words_text == line_text:
-            LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id)
-            line.set_Word(sorted_words)
+        if (segment_text == sorted_concat_text or
+                segment_text.replace(joiner, '') == sorted_concat_text.replace(joiner, '')):
+            LOG.info('Fixing element order of page "%s" segment "%s"', page_id, segment.id)
+            adoption(sorted_children)
         else:
-            LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
-                      page_id, line.id,
-                      str([word.id for word in words]),
-                      str([word.id for word in sorted_words]),
-                      words_text, line_text)
-
-
-def _fix_glyphs(word, page_id, reverse=False):
-    """Fix glyph order in a word"""
-
-    glyphs = word.get_Glyph()
-    if not glyphs:
-        return
-    word_text = get_text(word)
-    glyphs_text = get_text(glyphs, '')
-    if word_text != glyphs_text:
-        sorted_glyphs = sorted(glyphs, reverse=reverse,
-                               key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x)
-        sorted_glyphs_text = get_text(sorted_glyphs, '')
-
-        if sorted_glyphs_text == word_text:
-            LOG.info('Fixing glyph order of page "%s" word "%s"', page_id, word.id)
-            word.set_Glyph(sorted_glyphs)
-        else:
-            LOG.debug('Resorting glyphs of page "%s" word "%s" from %s to %s does not suffice to turn "%s" into "%s"',
-                      page_id, word.id,
-                      str([glyph.id for glyph in glyphs]),
-                      str([glyph.id for glyph in sorted_glyphs]),
-                      glyphs_text, word_text)
-
-
-def _fix_lines(region, page_id, reverse=False):
-    """Fix line order in a region"""
-
-    lines = region.get_TextLine()
-    if not lines:
-        return
-    region_text = get_text(region)
-    lines_text = get_text(lines, '\n')
-    if region_text != lines_text:
-        sorted_lines = sorted(lines, reverse=reverse,
-                              key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y)
-        sorted_lines_text = get_text(sorted_lines, '\n')
-
-        if sorted_lines_text == region_text:
-            LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id)
-            region.set_TextLine(sorted_lines)
-        else:
-            LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
-                      page_id, region.id,
-                      str([line.id for line in lines]),
-                      str([line.id for line in sorted_lines]),
-                      lines_text, region_text)
+            LOG.debug('Resorting children of page "%s" segment "%s" from %s to %s' +
+                      'does not suffice to turn "%s" into "%s"',
+                      page_id, segment.id,
+                      str([seg.id for seg in children]),
+                      str([seg.id for seg in sorted_children]),
+                      concat_text, segment_text)
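The hunk above is the core of this PR: the three per-level fixers collapse into one `_fix_segment`, `get_text` backs out with `None` (plus a warning) instead of raising when a `TextEquiv` is missing, and the consistency test now also accepts texts that agree once the joiner is stripped. A minimal sketch of that relaxed test on plain strings; the function names here are illustrative, not part of the module:

```python
# Sketch of the relaxed concatenation test and graceful backout (plain strings only).
def join_or_none(texts, joiner):
    """Join the texts, or return None if any of them is missing/empty."""
    if all(texts):
        return joiner.join(texts)
    return None


def texts_match(segment_text, concat_text, joiner):
    """Relaxed test: equal as-is, or equal after removing the joiner."""
    if not segment_text or not concat_text:
        return False  # missing annotation: do not claim (in)consistency
    return (segment_text == concat_text or
            segment_text.replace(joiner, '') == concat_text.replace(joiner, ''))


assert texts_match('foo bar', 'foo bar', ' ')
assert texts_match('foobar', 'foo bar', ' ')       # joiner-insensitive match
assert not texts_match('bar foo', 'foo bar', ' ')  # still needs re-ordering
assert join_or_none(['foo', None], ' ') is None    # back out gracefully
```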
setup.py: 3 lines changed

@@ -6,7 +6,8 @@ from setuptools import setup, find_packages
 setup(
     name='ocrd_repair_inconsistencies',
     description='Repair glyph/word/line order inconsistencies',
-    #long_description=codecs.open('README.md', encoding='utf-8').read(),
+    long_description=codecs.open('README.md', encoding='utf-8').read(),
+    long_description_content_type='text/markdown',
     author='Mike Gerber',
     author_email='mike.gerber@sbb.spk-berlin.de',
     license='Apache License 2.0',