From f829015bb51899859a9d4a2b0523eddb21096a10 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 29 Nov 2019 17:19:56 +0100 Subject: [PATCH 1/9] relax concatenation test: text must be equal irrespective of tokenization/joiner --- .../ocrd_repair_inconsistencies.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py index 282a070..50b26ef 100644 --- a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py +++ b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py @@ -126,11 +126,12 @@ def _fix_words(line, page_id, reverse=False): key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x) sorted_words_text = get_text(sorted_words, ' ') - if sorted_words_text == line_text: + if (sorted_words_text == line_text or + sorted_words_text.replace(' ', '') == line_text.replace(' ', '')): LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id) line.set_Word(sorted_words) else: - LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"', + LOG.debug('Resorting lines of page "%s" line "%s" from %s to %s does not suffice to turn "%s" into "%s"', page_id, line.id, str([word.id for word in words]), str([word.id for word in sorted_words]), @@ -174,7 +175,8 @@ def _fix_lines(region, page_id, reverse=False): key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y) sorted_lines_text = get_text(sorted_lines, '\n') - if sorted_lines_text == region_text: + if (sorted_lines_text == region_text or + sorted_lines_text.replace('\n', '') == region_text.replace('\n', '')): LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id) region.set_TextLine(sorted_lines) else: From 9002606e1c119d0fd55ea79ea4a0c43f17d4303f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 29 Nov 2019 17:46:16 +0100 Subject: [PATCH 2/9] 
unify function for 3 levels --- .../ocrd_repair_inconsistencies.py | 120 +++++++----------- 1 file changed, 47 insertions(+), 73 deletions(-) diff --git a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py index 50b26ef..bca6888 100644 --- a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py +++ b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py @@ -6,6 +6,7 @@ from collections import Sequence from ocrd import Processor from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( + TextRegionType, TextLineType, WordType, to_xml ) from ocrd_utils import ( @@ -49,7 +50,7 @@ class RepairInconsistencies(Processor): page_id, region.id, textLineOrder) continue - _fix_lines(region, page_id, reverse=textLineOrder=='bottom-to-top') + _fix_segment(region, page_id, reverse=textLineOrder=='bottom-to-top') lines = region.get_TextLine() for line in lines: @@ -65,7 +66,7 @@ class RepairInconsistencies(Processor): page_id, line.id, readingDirection) continue - _fix_words(line, page_id, reverse=readingDirection=='right-to-left') + _fix_segment(line, page_id, reverse=readingDirection=='right-to-left') words = line.get_Word() for word in words: @@ -81,7 +82,7 @@ class RepairInconsistencies(Processor): page_id, word.id, readingDirection) continue - _fix_glyphs(word, page_id, reverse=readingDirection=='right-to-left') + _fix_segment(word, page_id, reverse=readingDirection=='right-to-left') file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) if file_id == input_file.ID: @@ -113,75 +114,48 @@ def get_text(thing, joiner=None): return text -def _fix_words(line, page_id, reverse=False): - """Fix word order in a line""" - - words = line.get_Word() - if not words: - return - line_text = get_text(line) - words_text = get_text(words, ' ') - if line_text != words_text: - sorted_words = sorted(words, reverse=reverse, - key=lambda w: 
Polygon(polygon_from_points(w.get_Coords().points)).centroid.x) - sorted_words_text = get_text(sorted_words, ' ') - - if (sorted_words_text == line_text or - sorted_words_text.replace(' ', '') == line_text.replace(' ', '')): - LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id) - line.set_Word(sorted_words) - else: - LOG.debug('Resorting lines of page "%s" line "%s" from %s to %s does not suffice to turn "%s" into "%s"', - page_id, line.id, - str([word.id for word in words]), - str([word.id for word in sorted_words]), - words_text, line_text) - - -def _fix_glyphs(word, page_id, reverse=False): - """Fix glyph order in a word""" - - glyphs = word.get_Glyph() - if not glyphs: - return - word_text = get_text(word) - glyphs_text = get_text(glyphs, '') - if word_text != glyphs_text: - sorted_glyphs = sorted(glyphs, reverse=reverse, - key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x) - sorted_glyphs_text = get_text(sorted_glyphs, '') - - if sorted_glyphs_text == word_text: - LOG.info('Fixing glyph order of page "%s" word "%s"', page_id, word.id) - word.set_Glyph(sorted_glyphs) - else: - LOG.debug('Resorting glyphs of page "%s" word "%s" from %s to %s does not suffice to turn "%s" into "%s"', - page_id, word.id, - str([glyph.id for glyph in glyphs]), - str([glyph.id for glyph in sorted_glyphs]), - glyphs_text, word_text) - - -def _fix_lines(region, page_id, reverse=False): - """Fix line order in a region""" - - lines = region.get_TextLine() - if not lines: +def _fix_segment(segment, page_id, reverse=False): + """Fix order of child elements of (region/line/word) segment.""" + + if isinstance(segment, TextRegionType): + joiner = '\n' + sort_horizontal = False + children = segment.get_TextLine() + adoption = segment.set_TextLine + elif isinstance(segment, TextLineType): + joiner = ' ' + sort_horizontal = True + children = segment.get_Word() + adoption = segment.set_Word + elif isinstance(segment, WordType): + joiner = '' + 
sort_horizontal = True + children = segment.get_Glyph() + adoption = segment.set_Glyph + else: + raise Exception('invalid element type %s of segment to fix' % type(segment)) + if not children: return - region_text = get_text(region) - lines_text = get_text(lines, '\n') - if region_text != lines_text: - sorted_lines = sorted(lines, reverse=reverse, - key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y) - sorted_lines_text = get_text(sorted_lines, '\n') - - if (sorted_lines_text == region_text or - sorted_lines_text.replace('\n', '') == region_text.replace('\n', '')): - LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id) - region.set_TextLine(sorted_lines) + segment_text = get_text(segment) + concat_text = get_text(children, joiner) + if (segment_text != concat_text and + segment_text.replace(joiner, '') != concat_text.replace(joiner, '')): + def polygon_position(child, horizontal=sort_horizontal): + polygon = Polygon(polygon_from_points(child.get_Coords().points)) + if horizontal: + return polygon.centroid.x + else: + return polygon.centroid.y + sorted_children = sorted(children, reverse=reverse, key=polygon_position) + sorted_concat_text = get_text(sorted_children, joiner) + + if (segment_text == sorted_concat_text or + segment_text.replace(joiner, '') == sorted_concat_text.replace(joiner, '')): + LOG.info('Fixing element order of page "%s" segment "%s"', page_id, segment.id) + adoption(sorted_children) else: - LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"', - page_id, region.id, - str([line.id for line in lines]), - str([line.id for line in sorted_lines]), - lines_text, region_text) + LOG.debug('Resorting children of page "%s" segment "%s" from %s to %s does not suffice to turn "%s" into "%s"', + page_id, segment.id, + str([seg.id for seg in children]), + str([seg.id for seg in sorted_children]), + concat_text, segment_text) From 
ad8f25666b6799a5b403be91c7ffabee21b6212c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 29 Nov 2019 18:06:37 +0100 Subject: [PATCH 3/9] backout gracefully when text annotation is missing --- .../ocrd_repair_inconsistencies.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py index bca6888..bc8bb2b 100644 --- a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py +++ b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py @@ -96,23 +96,24 @@ class RepairInconsistencies(Processor): content=to_xml(pcgts)) -def get_text(thing, joiner=None): +def get_text(thing, joiner=''): """Get the text of the given thing, joining if necessary""" - def _get_text_for_one(t): - if len(t.get_TextEquiv()) != 1: - raise NotImplementedError + def _get_text_for_one(one): try: - return t.get_TextEquiv()[0].get_Unicode() + return one.get_TextEquiv()[0].get_Unicode() except Exception: + LOG.warning('element "%s" has no text', one.id) return None - + if isinstance(thing, Sequence): - text = joiner.join(_get_text_for_one(t) for t in thing) + texts = [_get_text_for_one(part) for part in thing] + if all(texts): + return joiner.join(texts) + else: + return None else: - text = _get_text_for_one(thing) - return text - + return _get_text_for_one(thing) def _fix_segment(segment, page_id, reverse=False): """Fix order of child elements of (region/line/word) segment.""" @@ -138,7 +139,8 @@ def _fix_segment(segment, page_id, reverse=False): return segment_text = get_text(segment) concat_text = get_text(children, joiner) - if (segment_text != concat_text and + if (segment_text and concat_text and + segment_text != concat_text and segment_text.replace(joiner, '') != concat_text.replace(joiner, '')): def polygon_position(child, horizontal=sort_horizontal): polygon = Polygon(polygon_from_points(child.get_Coords().points)) From 
e16438a37798385caf00e217e2ee251d686bd058 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 29 Nov 2019 18:15:56 +0100 Subject: [PATCH 4/9] add MetadataItem about processor --- .../ocrd_repair_inconsistencies.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py index bc8bb2b..038922a 100644 --- a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py +++ b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py @@ -7,6 +7,7 @@ from ocrd import Processor from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( TextRegionType, TextLineType, WordType, + MetadataItemType, LabelsType, LabelType, to_xml ) from ocrd_utils import ( @@ -34,7 +35,20 @@ class RepairInconsistencies(Processor): LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) page = pcgts.get_Page() - + + # add metadata about this operation and its runtime parameters: + metadata = pcgts.get_Metadata() # ensured by from_file() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + Labels=[LabelsType( + externalModel="ocrd-tool", + externalId="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + regions = page.get_TextRegion() for region in regions: From d5094ee42b284d825fe368c6274275aeaa7f9011 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 29 Nov 2019 18:20:28 +0100 Subject: [PATCH 5/9] pylint karma --- .pylintrc | 24 +++++++++++++++++++ ocrd_repair_inconsistencies/config.py | 2 +- .../ocrd_repair_inconsistencies.py | 18 +++++++------- 3 files changed, 33 insertions(+), 11 deletions(-) create mode 100644 .pylintrc diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..f4d4c09 --- /dev/null +++ b/.pylintrc @@ 
-0,0 +1,24 @@ +[MASTER] + +[MESSAGES CONTROL] +disable = + ungrouped-imports, + bad-continuation, + missing-docstring, + no-self-use, + superfluous-parens, + invalid-name, + line-too-long, + too-many-arguments, + too-many-branches, + too-many-statements, + too-many-locals, + too-few-public-methods, + wrong-import-order, + duplicate-code + +# allow indented whitespace (as required by interpreter): +no-space-check=empty-line + +# allow non-snake-case identifiers: +good-names=n,i diff --git a/ocrd_repair_inconsistencies/config.py b/ocrd_repair_inconsistencies/config.py index fc6e89e..01e0b23 100644 --- a/ocrd_repair_inconsistencies/config.py +++ b/ocrd_repair_inconsistencies/config.py @@ -1,4 +1,4 @@ import json from pkg_resources import resource_string -OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) \ No newline at end of file +OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) diff --git a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py index 038922a..3e47b51 100644 --- a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py +++ b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py @@ -64,7 +64,7 @@ class RepairInconsistencies(Processor): page_id, region.id, textLineOrder) continue - _fix_segment(region, page_id, reverse=textLineOrder=='bottom-to-top') + _fix_segment(region, page_id, reverse=(textLineOrder == 'bottom-to-top')) lines = region.get_TextLine() for line in lines: @@ -80,7 +80,7 @@ class RepairInconsistencies(Processor): page_id, line.id, readingDirection) continue - _fix_segment(line, page_id, reverse=readingDirection=='right-to-left') + _fix_segment(line, page_id, reverse=(readingDirection == 'right-to-left')) words = line.get_Word() for word in words: @@ -96,7 +96,7 @@ class RepairInconsistencies(Processor): page_id, word.id, readingDirection) continue - _fix_segment(word, page_id, 
reverse=readingDirection=='right-to-left') + _fix_segment(word, page_id, reverse=(readingDirection == 'right-to-left')) file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) if file_id == input_file.ID: @@ -124,10 +124,8 @@ def get_text(thing, joiner=''): texts = [_get_text_for_one(part) for part in thing] if all(texts): return joiner.join(texts) - else: - return None - else: - return _get_text_for_one(thing) + return None + return _get_text_for_one(thing) def _fix_segment(segment, page_id, reverse=False): """Fix order of child elements of (region/line/word) segment.""" @@ -160,8 +158,7 @@ def _fix_segment(segment, page_id, reverse=False): polygon = Polygon(polygon_from_points(child.get_Coords().points)) if horizontal: return polygon.centroid.x - else: - return polygon.centroid.y + return polygon.centroid.y sorted_children = sorted(children, reverse=reverse, key=polygon_position) sorted_concat_text = get_text(sorted_children, joiner) @@ -170,7 +167,8 @@ def _fix_segment(segment, page_id, reverse=False): LOG.info('Fixing element order of page "%s" segment "%s"', page_id, segment.id) adoption(sorted_children) else: - LOG.debug('Resorting children of page "%s" segment "%s" from %s to %s does not suffice to turn "%s" into "%s"', + LOG.debug('Resorting children of page "%s" segment "%s" from %s to %s' + + 'does not suffice to turn "%s" into "%s"', page_id, segment.id, str([seg.id for seg in children]), str([seg.id for seg in sorted_children]), From 2a2c97feffed3ae4a80c5081f332e7c2f5e747cc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 29 Nov 2019 18:21:47 +0100 Subject: [PATCH 6/9] setup: version + include long description --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9f56fc1..3bbca21 100644 --- a/setup.py +++ b/setup.py @@ -5,8 +5,10 @@ from setuptools import setup, find_packages setup( name='ocrd_repair_inconsistencies', + version='0.1.0', description='Repair glyph/word/line 
order inconsistencies', - #long_description=codecs.open('README.md', encoding='utf-8').read(), + long_description=codecs.open('README.md', encoding='utf-8').read(), + long_description_content_type='text/markdown', author='Mike Gerber', author_email='mike.gerber@sbb.spk-berlin.de', license='Apache License 2.0', From bb925a6a5b636c19ecf79749417ecf7c1548b3cb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 29 Nov 2019 18:30:48 +0100 Subject: [PATCH 7/9] add simple Makefile --- .gitignore | 1 + Makefile | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 Makefile diff --git a/.gitignore b/.gitignore index fb78ca3..f0f4919 100644 --- a/.gitignore +++ b/.gitignore @@ -104,5 +104,6 @@ venv.bak/ .mypy_cache/ # vim tmp +*~ *.swp *.swo diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d691d9b --- /dev/null +++ b/Makefile @@ -0,0 +1,21 @@ +SHELL = /bin/bash +PYTHON = python3 +PIP = pip3 + +define HELP +cat < Date: Fri, 29 Nov 2019 18:45:10 +0100 Subject: [PATCH 8/9] update README --- README.md | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 3d056f5..3b51455 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,49 @@ # ocrd_repair_inconsistencies -Automatically re-order lines, words and glyphs to become textually consistent with their parents. + Automatically re-order lines, words and glyphs to become textually consistent with their parents. + +## Introduction PAGE-XML elements with textual annotation are re-ordered by their centroid coordinates -in top-to-bottom/left-to-right fashion iff such re-ordering fixes the inconsistency -between their appropriately concatenated `TextEquiv` texts with their parent's `TextEquiv` text. +iff such re-ordering fixes the inconsistency between their appropriately concatenated +`TextEquiv` texts and their parent's `TextEquiv` text. + +If `TextEquiv` is missing, skip the respective elements. 
+ +Where available, respect the annotated visual order: +- For regions vs lines, sort in `top-to-bottom` fashion, unless another `textLineOrder` is annotated. + (Both `left-to-right` and `right-to-left` will be skipped currently.) +- For lines vs words and words vs glyphs, sort in `left-to-right` fashion, unless another `readingDirection` is annotated. + (Both `top-to-bottom` and `bottom-to-top` will be skipped currently.) This processor does not affect `ReadingOrder` between regions, just the order of the XML elements below the region level, and only if not contradicting the annotated `textLineOrder`/`readingDirection`. We wrote this as a one-shot script to fix some files. Use with caution. +## Installation + +(In your venv, run:) + +```sh +make deps # or pip install -r requirements.txt +make install # or pip install . +``` + +## Usage + +Offers the following user interfaces: + +### [OCR-D processor](https://ocr-d.github.io/cli) CLI `ocrd-repair-inconsistencies` + +To be used with [PageXML](https://github.com/PRImA-Research-Lab/PAGE-XML) +documents in an [OCR-D](https://ocr-d.github.io) annotation workflow. + +### Example -## Example usage +Use the following script to repair `OCR-D-GT-PAGE` annotation in workspaces, +and then replace it with the output on success: -For example, use this fix script: ~~~sh #!/bin/bash set -e From 1e8544e2abfc17581de40fa105ed02553e6bd0c9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 11 Dec 2019 14:49:03 +0000 Subject: [PATCH 9/9] no version then --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 3bbca21..da71f74 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,6 @@ from setuptools import setup, find_packages setup( name='ocrd_repair_inconsistencies', - version='0.1.0', description='Repair glyph/word/line order inconsistencies', long_description=codecs.open('README.md', encoding='utf-8').read(), long_description_content_type='text/markdown',