Merge pull request #6 from bertsky/relax-concat-test

Relax concatenation test, backout gracefully
pull/10/head
Mike Gerber 5 years ago committed by GitHub
commit 594d937ef5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

1
.gitignore vendored

@ -104,5 +104,6 @@ venv.bak/
.mypy_cache/ .mypy_cache/
# vim tmp # vim tmp
*~
*.swp *.swp
*.swo *.swo

@ -0,0 +1,24 @@
# Linter configuration (pylint-style sections and options).
# No options are set in the [MASTER] section.
[MASTER]
[MESSAGES CONTROL]
# Message names switched off for this project, as one comma-separated list
# (import ordering/grouping, naming, docstrings, formatting, and the
# too-many-*/too-few-* size checks, plus duplicate-code):
disable =
ungrouped-imports,
bad-continuation,
missing-docstring,
no-self-use,
superfluous-parens,
invalid-name,
line-too-long,
too-many-arguments,
too-many-branches,
too-many-statements,
too-many-locals,
too-few-public-methods,
wrong-import-order,
duplicate-code
# allow indented whitespace on otherwise empty lines (as required by interpreter):
no-space-check=empty-line
# identifier names that are always accepted:
good-names=n,i

@ -0,0 +1,21 @@
# Recipes are run with bash rather than make's default shell.
SHELL = /bin/bash
# Interpreter and installer commands; both can be overridden on the
# command line, e.g. `make install PIP=pip`.
PYTHON = python3
PIP = pip3

# Help text, kept as a shell here-document command in a multi-line variable.
# It is exported into the environment so that the `help` recipe can run it
# with `eval` (the doubled `$$` passes a literal `$HELP` to the shell).
define HELP
cat <<EOF
ocrd_repair_inconsistencies
Targets:
deps Install Python dependencies via pip
install Install Python package
EOF
endef
export HELP

# None of these targets creates a file of the same name. Declaring them
# phony keeps them runnable even if a file or directory called `help`,
# `deps` or `install` exists in the working directory.
.PHONY: help deps install

# First regular target, hence the default goal: print the help text.
help: ; @eval "$$HELP"

# Install the requirements listed in requirements.txt.
deps:
	$(PIP) install -r requirements.txt

# Install the package from the current directory.
install:
	$(PIP) install .

@ -2,19 +2,48 @@
Automatically re-order lines, words and glyphs to become textually consistent with their parents. Automatically re-order lines, words and glyphs to become textually consistent with their parents.
## Introduction
PAGE-XML elements with textual annotation are re-ordered by their centroid coordinates PAGE-XML elements with textual annotation are re-ordered by their centroid coordinates
in top-to-bottom/left-to-right fashion iff such re-ordering fixes the inconsistency iff such re-ordering fixes the inconsistency between their appropriately concatenated
between their appropriately concatenated `TextEquiv` texts with their parent's `TextEquiv` text. `TextEquiv` texts with their parent's `TextEquiv` text.
If `TextEquiv` is missing, skip the respective elements.
Where available, respect the annotated visual order:
- For regions vs lines, sort in `top-to-bottom` fashion, unless another `textLineOrder` is annotated.
(Both `left-to-right` and `right-to-left` are currently skipped.)
- For lines vs words and words vs glyphs, sort in `left-to-right` fashion, unless another `readingDirection` is annotated.
(Both `top-to-bottom` and `bottom-to-top` are currently skipped.)
This processor does not affect `ReadingOrder` between regions, just the order of the XML elements This processor does not affect `ReadingOrder` between regions, just the order of the XML elements
below the region level, and only if not contradicting the annotated `textLineOrder`/`readingDirection`. below the region level, and only if not contradicting the annotated `textLineOrder`/`readingDirection`.
We wrote this as a one-shot script to fix some files. Use with caution. We wrote this as a one-shot script to fix some files. Use with caution.
## Installation
(In your venv, run:)
```sh
make deps # or pip install -r requirements.txt
make install # or pip install .
```
## Usage
Offers the following user interfaces:
### [OCR-D processor](https://ocr-d.github.io/cli) CLI `ocrd-repair-inconsistencies`
To be used with [PageXML](https://github.com/PRImA-Research-Lab/PAGE-XML)
documents in an [OCR-D](https://ocr-d.github.io) annotation workflow.
### Example
## Example usage Use the following script to repair `OCR-D-GT-PAGE` annotation in workspaces,
and then replace it with the output on success:
For example, use this fix script:
~~~sh ~~~sh
#!/bin/bash #!/bin/bash
set -e set -e

@ -6,6 +6,8 @@ from collections import Sequence
from ocrd import Processor from ocrd import Processor
from ocrd_modelfactory import page_from_file from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import ( from ocrd_models.ocrd_page import (
TextRegionType, TextLineType, WordType,
MetadataItemType, LabelsType, LabelType,
to_xml to_xml
) )
from ocrd_utils import ( from ocrd_utils import (
@ -34,6 +36,19 @@ class RepairInconsistencies(Processor):
pcgts = page_from_file(self.workspace.download_file(input_file)) pcgts = page_from_file(self.workspace.download_file(input_file))
page = pcgts.get_Page() page = pcgts.get_Page()
# add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))
regions = page.get_TextRegion() regions = page.get_TextRegion()
for region in regions: for region in regions:
@ -49,7 +64,7 @@ class RepairInconsistencies(Processor):
page_id, region.id, textLineOrder) page_id, region.id, textLineOrder)
continue continue
_fix_lines(region, page_id, reverse=textLineOrder=='bottom-to-top') _fix_segment(region, page_id, reverse=(textLineOrder == 'bottom-to-top'))
lines = region.get_TextLine() lines = region.get_TextLine()
for line in lines: for line in lines:
@ -65,7 +80,7 @@ class RepairInconsistencies(Processor):
page_id, line.id, readingDirection) page_id, line.id, readingDirection)
continue continue
_fix_words(line, page_id, reverse=readingDirection=='right-to-left') _fix_segment(line, page_id, reverse=(readingDirection == 'right-to-left'))
words = line.get_Word() words = line.get_Word()
for word in words: for word in words:
@ -81,7 +96,7 @@ class RepairInconsistencies(Processor):
page_id, word.id, readingDirection) page_id, word.id, readingDirection)
continue continue
_fix_glyphs(word, page_id, reverse=readingDirection=='right-to-left') _fix_segment(word, page_id, reverse=(readingDirection == 'right-to-left'))
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
if file_id == input_file.ID: if file_id == input_file.ID:
@ -95,91 +110,66 @@ class RepairInconsistencies(Processor):
content=to_xml(pcgts)) content=to_xml(pcgts))
def get_text(thing, joiner=None): def get_text(thing, joiner=''):
"""Get the text of the given thing, joining if necessary""" """Get the text of the given thing, joining if necessary"""
def _get_text_for_one(t): def _get_text_for_one(one):
if len(t.get_TextEquiv()) != 1:
raise NotImplementedError
try: try:
return t.get_TextEquiv()[0].get_Unicode() return one.get_TextEquiv()[0].get_Unicode()
except Exception: except Exception:
LOG.warning('element "%s" has no text', one.id)
return None return None
if isinstance(thing, Sequence): if isinstance(thing, Sequence):
text = joiner.join(_get_text_for_one(t) for t in thing) texts = [_get_text_for_one(part) for part in thing]
else: if all(texts):
text = _get_text_for_one(thing) return joiner.join(texts)
return text return None
return _get_text_for_one(thing)
def _fix_words(line, page_id, reverse=False): def _fix_segment(segment, page_id, reverse=False):
"""Fix word order in a line""" """Fix order of child elements of (region/line/word) segment."""
words = line.get_Word() if isinstance(segment, TextRegionType):
if not words: joiner = '\n'
return sort_horizontal = False
line_text = get_text(line) children = segment.get_TextLine()
words_text = get_text(words, ' ') adoption = segment.set_TextLine
if line_text != words_text: elif isinstance(segment, TextLineType):
sorted_words = sorted(words, reverse=reverse, joiner = ' '
key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x) sort_horizontal = True
sorted_words_text = get_text(sorted_words, ' ') children = segment.get_Word()
adoption = segment.set_Word
if sorted_words_text == line_text: elif isinstance(segment, WordType):
LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id) joiner = ''
line.set_Word(sorted_words) sort_horizontal = True
else: children = segment.get_Glyph()
LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"', adoption = segment.set_Glyph
page_id, line.id,
str([word.id for word in words]),
str([word.id for word in sorted_words]),
words_text, line_text)
def _fix_glyphs(word, page_id, reverse=False):
"""Fix glyph order in a word"""
glyphs = word.get_Glyph()
if not glyphs:
return
word_text = get_text(word)
glyphs_text = get_text(glyphs, '')
if word_text != glyphs_text:
sorted_glyphs = sorted(glyphs, reverse=reverse,
key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x)
sorted_glyphs_text = get_text(sorted_glyphs, '')
if sorted_glyphs_text == word_text:
LOG.info('Fixing glyph order of page "%s" word "%s"', page_id, word.id)
word.set_Glyph(sorted_glyphs)
else: else:
LOG.debug('Resorting glyphs of page "%s" word "%s" from %s to %s does not suffice to turn "%s" into "%s"', raise Exception('invalid element type %s of segment to fix' % type(segment))
page_id, word.id, if not children:
str([glyph.id for glyph in glyphs]),
str([glyph.id for glyph in sorted_glyphs]),
glyphs_text, word_text)
def _fix_lines(region, page_id, reverse=False):
"""Fix line order in a region"""
lines = region.get_TextLine()
if not lines:
return return
region_text = get_text(region) segment_text = get_text(segment)
lines_text = get_text(lines, '\n') concat_text = get_text(children, joiner)
if region_text != lines_text: if (segment_text and concat_text and
sorted_lines = sorted(lines, reverse=reverse, segment_text != concat_text and
key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y) segment_text.replace(joiner, '') != concat_text.replace(joiner, '')):
sorted_lines_text = get_text(sorted_lines, '\n') def polygon_position(child, horizontal=sort_horizontal):
polygon = Polygon(polygon_from_points(child.get_Coords().points))
if sorted_lines_text == region_text: if horizontal:
LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id) return polygon.centroid.x
region.set_TextLine(sorted_lines) return polygon.centroid.y
sorted_children = sorted(children, reverse=reverse, key=polygon_position)
sorted_concat_text = get_text(sorted_children, joiner)
if (segment_text == sorted_concat_text or
segment_text.replace(joiner, '') == sorted_concat_text.replace(joiner, '')):
LOG.info('Fixing element order of page "%s" segment "%s"', page_id, segment.id)
adoption(sorted_children)
else: else:
LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"', LOG.debug('Resorting children of page "%s" segment "%s" from %s to %s' +
page_id, region.id, 'does not suffice to turn "%s" into "%s"',
str([line.id for line in lines]), page_id, segment.id,
str([line.id for line in sorted_lines]), str([seg.id for seg in children]),
lines_text, region_text) str([seg.id for seg in sorted_children]),
concat_text, segment_text)

@ -6,7 +6,8 @@ from setuptools import setup, find_packages
setup( setup(
name='ocrd_repair_inconsistencies', name='ocrd_repair_inconsistencies',
description='Repair glyph/word/line order inconsistencies', description='Repair glyph/word/line order inconsistencies',
#long_description=codecs.open('README.md', encoding='utf-8').read(), long_description=codecs.open('README.md', encoding='utf-8').read(),
long_description_content_type='text/markdown',
author='Mike Gerber', author='Mike Gerber',
author_email='mike.gerber@sbb.spk-berlin.de', author_email='mike.gerber@sbb.spk-berlin.de',
license='Apache License 2.0', license='Apache License 2.0',

Loading…
Cancel
Save