From 8d21cd8ab91d66826b4e57d53f7e4c66db68f78b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 22 Nov 2019 16:18:05 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=89=20Initial=20commit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 108 +++++++++++++++++ ocrd_repair_inconsistencies/cli.py | 10 ++ ocrd_repair_inconsistencies/config.py | 4 + ocrd_repair_inconsistencies/ocrd-tool.json | 22 ++++ .../ocrd_repair_inconsistencies.py | 114 ++++++++++++++++++ requirements.txt | 3 + setup.py | 23 ++++ 7 files changed, 284 insertions(+) create mode 100644 .gitignore create mode 100644 ocrd_repair_inconsistencies/cli.py create mode 100644 ocrd_repair_inconsistencies/config.py create mode 100644 ocrd_repair_inconsistencies/ocrd-tool.json create mode 100644 ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py create mode 100644 requirements.txt create mode 100644 setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fb78ca3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,108 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
@click.command()
@ocrd_cli_options
def ocrd_repair_inconsistencies(*args, **kwargs):
    # Console entry point: forward whatever the OCR-D option decorators
    # collected to the OCR-D processor wrapper, together with the processor
    # class to run.  Documented with a comment rather than a docstring because
    # click would show a docstring as the command's help text.
    processor_class = RepairInconsistencies
    return ocrd_cli_wrap_processor(processor_class, *args, **kwargs)
b/ocrd_repair_inconsistencies/ocrd-tool.json @@ -0,0 +1,22 @@ +{ + "tools": { + "ocrd_repair_inconsistencies": { + "executable": "ocrd_repair_inconsistencies", + "categories": [ + "Layout analysis" + ], + "description": "Repair glyph/word/line order inconsistencies", + "input_file_grp": [ + "OCR-D-SEG-BLOCK" + ], + "output_file_grp": [ + "OCR-D-SEG-BLOCK-FIXED" + ], + "steps": [ + "layout/segmentation/region", + "layout/segmentation/line", + "layout/segmentation/words" + ] + } + } +} diff --git a/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py new file mode 100644 index 0000000..a73831b --- /dev/null +++ b/ocrd_repair_inconsistencies/ocrd_repair_inconsistencies.py @@ -0,0 +1,114 @@ +from __future__ import absolute_import + +import os.path +from collections import Sequence + +from shapely.geometry import Polygon, LineString + +from ocrd import Processor +from ocrd_utils import ( + getLogger, concat_padded, + coordinates_for_segment, + coordinates_of_segment, + polygon_from_points, + points_from_polygon, + xywh_from_polygon, + MIMETYPE_PAGE +) +from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_page import ( + CoordsType, + LabelType, LabelsType, + MetadataItemType, + to_xml +) +from ocrd_models.ocrd_page_generateds import ( + RegionRefType, + RegionRefIndexedType, + OrderedGroupType, + OrderedGroupIndexedType, + UnorderedGroupType, + UnorderedGroupIndexedType, + ReadingOrderType +) +from .config import OCRD_TOOL + +TOOL = 'ocrd_repair_inconsistencies' +LOG = getLogger('processor.RepairInconsistencies') + +class RepairInconsistencies(Processor): + + def __init__(self, *args, **kwargs): + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] + super(RepairInconsistencies, self).__init__(*args, **kwargs) + + def process(self): + for (n, input_file) in enumerate(self.input_files): + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %i / %s", n, page_id) + pcgts = 
def get_text(thing, joiner=None):
    """Get the text of the given thing, joining if necessary.

    Args:
        thing: either a single element offering ``get_TextEquiv()``, or a
            sequence of such elements.
        joiner: string placed between the texts when *thing* is a sequence;
            ``None`` (the default) joins without a separator.

    Returns:
        The text of the element, or the joined texts of all elements of the
        sequence.  ``None`` if the element, or any element of the sequence,
        has no text, so a partially transcribed sequence never compares equal
        to a complete text.
    """
    # Imported locally from ``collections.abc``: the ``collections.Sequence``
    # alias was removed in Python 3.10.
    from collections.abc import Sequence

    def _get_text_for_one(t):
        # XXX Assumes len(TextEquiv) == 1
        try:
            return t.get_TextEquiv()[0].get_Unicode()
        except (AttributeError, IndexError, TypeError):
            # No accessor, an empty TextEquiv list, or no list at all.
            return None

    if not isinstance(thing, Sequence):
        return _get_text_for_one(thing)

    texts = [_get_text_for_one(t) for t in thing]
    if any(text is None for text in texts):
        # str.join() would raise TypeError on the missing text.
        return None
    return ('' if joiner is None else joiner).join(texts)
# -*- coding: utf-8 -*-
import codecs

from setuptools import setup, find_packages


def _read_requirements(path='requirements.txt'):
    """Return the requirement lines of the file at *path*.

    The file is closed again after reading.  Blank lines and ``#`` comment
    lines are dropped so that no empty requirement string reaches
    ``install_requires``.
    """
    with codecs.open(path, encoding='utf-8') as handle:
        stripped = (line.strip() for line in handle)
        return [line for line in stripped if line and not line.startswith('#')]


setup(
    name='ocrd_repair_inconsistencies',
    description='Repair glyph/word/line order inconsistencies',
    #long_description=codecs.open('README.md', encoding='utf-8').read(),
    author='Mike Gerber',
    author_email='mike.gerber@sbb.spk-berlin.de',
    license='Apache License 2.0',
    packages=find_packages(exclude=('tests', 'docs')),
    install_requires=_read_requirements(),
    package_data={
        '': ['*.json', '*.yml', '*.yaml'],
    },
    entry_points={
        'console_scripts': [
            'ocrd_repair_inconsistencies=ocrd_repair_inconsistencies.cli:ocrd_repair_inconsistencies',
        ]
    },
)