🎉 Initial commit

pull/1/head
Gerber, Mike 5 years ago
commit 8d21cd8ab9

108
.gitignore vendored

@ -0,0 +1,108 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# vim tmp
*.swp
*.swo

@ -0,0 +1,10 @@
import click
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_repair_inconsistencies.ocrd_repair_inconsistencies import RepairInconsistencies
@click.command()
@ocrd_cli_options
def ocrd_repair_inconsistencies(*args, **kwargs):
    # Command-line entry point. Every argument collected by click through
    # the ocrd_cli_options decorator is forwarded unchanged to
    # ocrd_cli_wrap_processor together with the processor class to run.
    # (Deliberately a comment, not a docstring: click would show a docstring
    # as the command's help text.)
    processor_class = RepairInconsistencies
    return ocrd_cli_wrap_processor(processor_class, *args, **kwargs)

@ -0,0 +1,4 @@
import json
from pkg_resources import resource_string
# Tool description parsed once, at import time, from the 'ocrd-tool.json'
# resource looked up relative to this module. resource_string() hands back
# bytes, hence the explicit UTF-8 decode before JSON parsing.
OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))

@ -0,0 +1,22 @@
{
"tools": {
"ocrd_repair_inconsistencies": {
"executable": "ocrd_repair_inconsistencies",
"categories": [
"Layout analysis"
],
"description": "Repair glyph/word/line order inconsistencies",
"input_file_grp": [
"OCR-D-SEG-BLOCK"
],
"output_file_grp": [
"OCR-D-SEG-BLOCK-FIXED"
],
"steps": [
"layout/segmentation/region",
"layout/segmentation/line",
"layout/segmentation/word"
]
}
}
}

@ -0,0 +1,114 @@
from __future__ import absolute_import

import os.path

try:
    # The abstract base classes live in collections.abc; the aliases in
    # ``collections`` were removed in Python 3.10.
    from collections.abc import Sequence
except ImportError:
    # Python 2 fallback.
    from collections import Sequence

from shapely.geometry import Polygon, LineString

from ocrd import Processor
from ocrd_utils import (
    getLogger, concat_padded,
    coordinates_for_segment,
    coordinates_of_segment,
    polygon_from_points,
    points_from_polygon,
    xywh_from_polygon,
    MIMETYPE_PAGE
)
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
    CoordsType,
    LabelType, LabelsType,
    MetadataItemType,
    to_xml
)
from ocrd_models.ocrd_page_generateds import (
    RegionRefType,
    RegionRefIndexedType,
    OrderedGroupType,
    OrderedGroupIndexedType,
    UnorderedGroupType,
    UnorderedGroupIndexedType,
    ReadingOrderType
)

from .config import OCRD_TOOL
# Key of this processor's entry under OCRD_TOOL['tools'].
TOOL = 'ocrd_repair_inconsistencies'
# Logger shared by everything in this module.
LOG = getLogger('processor.RepairInconsistencies')
class RepairInconsistencies(Processor):
    """Processor that re-orders words and glyphs to match their parent's text.

    For every text line, the words are re-sorted when their joined text
    differs from the line's own text; for every word, the glyphs are
    re-sorted when their joined text differs from the word's own text.
    A new order is only written back when it makes the joined text equal
    to the parent's text; otherwise the element is left untouched.
    """

    def __init__(self, *args, **kwargs):
        # Hand this tool's section of ocrd-tool.json to the base class.
        kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
        super(RepairInconsistencies, self).__init__(*args, **kwargs)

    @staticmethod
    def _centroid_x(segment):
        """Return the x coordinate of the centroid of the segment's polygon."""
        outline = polygon_from_points(segment.get_Coords().points)
        return Polygon(outline).centroid.x

    def _repair_order(self, parent, children, joiner, set_children, message):
        """Re-sort ``children`` by centroid x if that reproduces the parent's text.

        ``children`` are joined with ``joiner`` and compared to the text of
        ``parent``. Only when they differ are the children sorted, and only
        when the sorted sequence joins to exactly the parent's text is it
        stored through ``set_children`` (after logging ``message`` with the
        parent's id).
        """
        parent_text = get_text(parent)
        if parent_text == get_text(children, joiner):
            return  # already consistent

        # XXX Assumes left-to-right
        reordered = sorted(children, key=self._centroid_x)
        if get_text(reordered, joiner) == parent_text:
            LOG.info(message, parent.id)
            set_children(reordered)

    def process(self):
        """Repair word and glyph order on every input page and save the result.

        Each input file is downloaded and parsed, the text lines of the
        regions returned by ``page.get_TextRegion()`` are repaired in place,
        and the page is serialized with ``to_xml`` and added to the output
        file group. The output file ID is the input ID with the input group
        name replaced by the output group name; if that replacement leaves
        the ID unchanged, ``concat_padded(output_file_grp, n)`` is used
        instead.
        """
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()

            for region in page.get_TextRegion():
                for line in region.get_TextLine():
                    # Fix words in lines
                    self._repair_order(
                        line, line.get_Word(), ' ', line.set_Word,
                        'Fixing word order of line "%s"')

                    # Fix glyphs in words. The words are fetched again
                    # because the step above may have replaced the list.
                    for word in line.get_Word():
                        self._repair_order(
                            word, word.get_Glyph(), '', word.set_Glyph,
                            'Fixing glyph order of word "%s"')

            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
            if file_id == input_file.ID:
                # The input ID did not contain the input group name, so make
                # up an ID that cannot clash with the input file's.
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
def get_text(thing, joiner=None):
    """Get the text of the given thing, joining if necessary.

    ``thing`` is either a single element offering ``get_TextEquiv()`` or a
    sequence of such elements.

    For a single element, return the ``get_Unicode()`` value of its first
    TextEquiv, or ``None`` if that cannot be looked up.

    For a sequence, return the texts of all elements joined with ``joiner``
    (``None`` is treated as the empty string). If any element has no text,
    return ``None`` instead of a partial string, so that the result can
    never compare equal to a real parent text by accident.
    """

    def _get_text_for_one(t):
        # XXX Assumes len(TextEquiv) == 1
        try:
            return t.get_TextEquiv()[0].get_Unicode()
        except (AttributeError, IndexError, TypeError):
            # No such accessor, no TextEquiv at all, or a None in between.
            return None

    if not isinstance(thing, Sequence):
        return _get_text_for_one(thing)

    texts = [_get_text_for_one(t) for t in thing]
    if any(text is None for text in texts):
        # str.join() would raise on None; report "no text" instead.
        return None
    separator = '' if joiner is None else joiner
    return separator.join(texts)

@ -0,0 +1,3 @@
click
shapely
ocrd

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
import codecs
from setuptools import setup, find_packages
# Read the runtime dependencies from requirements.txt. The file is closed
# deterministically, and blank lines (e.g. from the trailing newline) and
# comment lines are dropped so that no empty requirement is passed on.
with open('requirements.txt') as requirements_file:
    install_requires = [
        line.strip()
        for line in requirements_file
        if line.strip() and not line.strip().startswith('#')
    ]

setup(
    name='ocrd_repair_inconsistencies',
    description='Repair glyph/word/line order inconsistencies',
    #long_description=codecs.open('README.md', encoding='utf-8').read(),
    author='Mike Gerber',
    author_email='mike.gerber@sbb.spk-berlin.de',
    license='Apache License 2.0',
    packages=find_packages(exclude=('tests', 'docs')),
    install_requires=install_requires,
    # Ship the JSON/YAML resources (such as ocrd-tool.json) found inside
    # any package.
    package_data={
        '': ['*.json', '*.yml', '*.yaml'],
    },
    entry_points={
        'console_scripts': [
            'ocrd_repair_inconsistencies=ocrd_repair_inconsistencies.cli:ocrd_repair_inconsistencies',
        ]
    },
)
Loading…
Cancel
Save