Merge pull request #6 from bertsky/relax-concat-test

Relax concatenation test, back out gracefully
pull/10/head
Mike Gerber 5 years ago committed by GitHub
commit 594d937ef5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

1
.gitignore vendored

@ -104,5 +104,6 @@ venv.bak/
.mypy_cache/
# vim tmp
*~
*.swp
*.swo

@ -0,0 +1,24 @@
[MASTER]
[MESSAGES CONTROL]
disable =
ungrouped-imports,
bad-continuation,
missing-docstring,
no-self-use,
superfluous-parens,
invalid-name,
line-too-long,
too-many-arguments,
too-many-branches,
too-many-statements,
too-many-locals,
too-few-public-methods,
wrong-import-order,
duplicate-code
# allow indented whitespace (as required by interpreter):
no-space-check=empty-line
# allow non-snake-case identifiers:
good-names=n,i

@ -0,0 +1,21 @@
SHELL = /bin/bash
PYTHON = python3
PIP = pip3
define HELP
cat <<EOF
ocrd_repair_inconsistencies
Targets:
deps Install Python dependencies via pip
install Install Python package
EOF
endef
export HELP
help: ; @eval "$$HELP"
deps:
$(PIP) install -r requirements.txt
install:
$(PIP) install .

@ -1,20 +1,49 @@
# ocrd_repair_inconsistencies
Automatically re-order lines, words and glyphs to become textually consistent with their parents.
Automatically re-order lines, words and glyphs to become textually consistent with their parents.
## Introduction
PAGE-XML elements with textual annotation are re-ordered by their centroid coordinates
in top-to-bottom/left-to-right fashion iff such re-ordering fixes the inconsistency
between their appropriately concatenated `TextEquiv` texts with their parent's `TextEquiv` text.
iff such re-ordering fixes the inconsistency between their appropriately concatenated
`TextEquiv` texts and their parent's `TextEquiv` text.
If `TextEquiv` is missing, skip the respective elements.
Where available, respect the annotated visual order:
- For regions vs lines, sort in `top-to-bottom` fashion, unless another `textLineOrder` is annotated.
(Both `left-to-right` and `right-to-left` will be skipped currently.)
- For lines vs words and words vs glyphs, sort in `left-to-right` fashion, unless another `readingDirection` is annotated.
(Both `top-to-bottom` and `bottom-to-top` will be skipped currently.)
This processor does not affect `ReadingOrder` between regions, just the order of the XML elements
below the region level, and only if not contradicting the annotated `textLineOrder`/`readingDirection`.
We wrote this as a one-shot script to fix some files. Use with caution.
## Installation
(In your venv, run:)
```sh
make deps # or pip install -r requirements.txt
make install # or pip install .
```
## Usage
Offers the following user interfaces:
### [OCR-D processor](https://ocr-d.github.io/cli) CLI `ocrd-repair-inconsistencies`
To be used with [PageXML](https://github.com/PRImA-Research-Lab/PAGE-XML)
documents in an [OCR-D](https://ocr-d.github.io) annotation workflow.
### Example
## Example usage
Use the following script to repair `OCR-D-GT-PAGE` annotation in workspaces,
and then replace it with the output on success:
For example, use this fix script:
~~~sh
#!/bin/bash
set -e

@ -1,4 +1,4 @@
import json
from pkg_resources import resource_string
OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))

@ -6,6 +6,8 @@ from collections import Sequence
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
TextRegionType, TextLineType, WordType,
MetadataItemType, LabelsType, LabelType,
to_xml
)
from ocrd_utils import (
@ -33,7 +35,20 @@ class RepairInconsistencies(Processor):
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
page = pcgts.get_Page()
# add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))
regions = page.get_TextRegion()
for region in regions:
@ -49,7 +64,7 @@ class RepairInconsistencies(Processor):
page_id, region.id, textLineOrder)
continue
_fix_lines(region, page_id, reverse=textLineOrder=='bottom-to-top')
_fix_segment(region, page_id, reverse=(textLineOrder == 'bottom-to-top'))
lines = region.get_TextLine()
for line in lines:
@ -65,7 +80,7 @@ class RepairInconsistencies(Processor):
page_id, line.id, readingDirection)
continue
_fix_words(line, page_id, reverse=readingDirection=='right-to-left')
_fix_segment(line, page_id, reverse=(readingDirection == 'right-to-left'))
words = line.get_Word()
for word in words:
@ -81,7 +96,7 @@ class RepairInconsistencies(Processor):
page_id, word.id, readingDirection)
continue
_fix_glyphs(word, page_id, reverse=readingDirection=='right-to-left')
_fix_segment(word, page_id, reverse=(readingDirection == 'right-to-left'))
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
if file_id == input_file.ID:
@ -95,91 +110,66 @@ class RepairInconsistencies(Processor):
content=to_xml(pcgts))
def get_text(thing, joiner=None):
def get_text(thing, joiner=''):
"""Get the text of the given thing, joining if necessary"""
def _get_text_for_one(t):
if len(t.get_TextEquiv()) != 1:
raise NotImplementedError
def _get_text_for_one(one):
try:
return t.get_TextEquiv()[0].get_Unicode()
return one.get_TextEquiv()[0].get_Unicode()
except Exception:
LOG.warning('element "%s" has no text', one.id)
return None
if isinstance(thing, Sequence):
text = joiner.join(_get_text_for_one(t) for t in thing)
texts = [_get_text_for_one(part) for part in thing]
if all(texts):
return joiner.join(texts)
return None
return _get_text_for_one(thing)
def _fix_segment(segment, page_id, reverse=False):
"""Fix order of child elements of (region/line/word) segment."""
if isinstance(segment, TextRegionType):
joiner = '\n'
sort_horizontal = False
children = segment.get_TextLine()
adoption = segment.set_TextLine
elif isinstance(segment, TextLineType):
joiner = ' '
sort_horizontal = True
children = segment.get_Word()
adoption = segment.set_Word
elif isinstance(segment, WordType):
joiner = ''
sort_horizontal = True
children = segment.get_Glyph()
adoption = segment.set_Glyph
else:
text = _get_text_for_one(thing)
return text
def _fix_words(line, page_id, reverse=False):
"""Fix word order in a line"""
words = line.get_Word()
if not words:
return
line_text = get_text(line)
words_text = get_text(words, ' ')
if line_text != words_text:
sorted_words = sorted(words, reverse=reverse,
key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x)
sorted_words_text = get_text(sorted_words, ' ')
if sorted_words_text == line_text:
LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id)
line.set_Word(sorted_words)
else:
LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
page_id, line.id,
str([word.id for word in words]),
str([word.id for word in sorted_words]),
words_text, line_text)
def _fix_glyphs(word, page_id, reverse=False):
"""Fix glyph order in a word"""
glyphs = word.get_Glyph()
if not glyphs:
return
word_text = get_text(word)
glyphs_text = get_text(glyphs, '')
if word_text != glyphs_text:
sorted_glyphs = sorted(glyphs, reverse=reverse,
key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x)
sorted_glyphs_text = get_text(sorted_glyphs, '')
if sorted_glyphs_text == word_text:
LOG.info('Fixing glyph order of page "%s" word "%s"', page_id, word.id)
word.set_Glyph(sorted_glyphs)
else:
LOG.debug('Resorting glyphs of page "%s" word "%s" from %s to %s does not suffice to turn "%s" into "%s"',
page_id, word.id,
str([glyph.id for glyph in glyphs]),
str([glyph.id for glyph in sorted_glyphs]),
glyphs_text, word_text)
def _fix_lines(region, page_id, reverse=False):
"""Fix line order in a region"""
lines = region.get_TextLine()
if not lines:
raise Exception('invalid element type %s of segment to fix' % type(segment))
if not children:
return
region_text = get_text(region)
lines_text = get_text(lines, '\n')
if region_text != lines_text:
sorted_lines = sorted(lines, reverse=reverse,
key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y)
sorted_lines_text = get_text(sorted_lines, '\n')
if sorted_lines_text == region_text:
LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id)
region.set_TextLine(sorted_lines)
segment_text = get_text(segment)
concat_text = get_text(children, joiner)
if (segment_text and concat_text and
segment_text != concat_text and
segment_text.replace(joiner, '') != concat_text.replace(joiner, '')):
def polygon_position(child, horizontal=sort_horizontal):
polygon = Polygon(polygon_from_points(child.get_Coords().points))
if horizontal:
return polygon.centroid.x
return polygon.centroid.y
sorted_children = sorted(children, reverse=reverse, key=polygon_position)
sorted_concat_text = get_text(sorted_children, joiner)
if (segment_text == sorted_concat_text or
segment_text.replace(joiner, '') == sorted_concat_text.replace(joiner, '')):
LOG.info('Fixing element order of page "%s" segment "%s"', page_id, segment.id)
adoption(sorted_children)
else:
LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
page_id, region.id,
str([line.id for line in lines]),
str([line.id for line in sorted_lines]),
lines_text, region_text)
LOG.debug('Resorting children of page "%s" segment "%s" from %s to %s' +
'does not suffice to turn "%s" into "%s"',
page_id, segment.id,
str([seg.id for seg in children]),
str([seg.id for seg in sorted_children]),
concat_text, segment_text)

@ -6,7 +6,8 @@ from setuptools import setup, find_packages
setup(
name='ocrd_repair_inconsistencies',
description='Repair glyph/word/line order inconsistencies',
#long_description=codecs.open('README.md', encoding='utf-8').read(),
long_description=codecs.open('README.md', encoding='utf-8').read(),
long_description_content_type='text/markdown',
author='Mike Gerber',
author_email='mike.gerber@sbb.spk-berlin.de',
license='Apache License 2.0',

Loading…
Cancel
Save