Mirror of https://github.com/qurator-spk/ocrd_repair_inconsistencies.git
(synced 2025-06-26 12:09:56 +02:00)

Merge pull request #6 from bertsky/relax-concat-test

Relax concatenation test, backout gracefully

commit 594d937ef5
7 changed files with 156 additions and 90 deletions
.gitignore (vendored): 1 line added

@@ -104,5 +104,6 @@ venv.bak/
 .mypy_cache/

 # vim tmp
+*~
 *.swp
 *.swo
.pylintrc (new file): 24 lines added

@@ -0,0 +1,24 @@
+[MASTER]
+
+[MESSAGES CONTROL]
+disable =
+    ungrouped-imports,
+    bad-continuation,
+    missing-docstring,
+    no-self-use,
+    superfluous-parens,
+    invalid-name,
+    line-too-long,
+    too-many-arguments,
+    too-many-branches,
+    too-many-statements,
+    too-many-locals,
+    too-few-public-methods,
+    wrong-import-order,
+    duplicate-code
+
+# allow indented whitespace (as required by interpreter):
+no-space-check=empty-line
+
+# allow non-snake-case identifiers:
+good-names=n,i
Makefile (new file): 21 lines added

@@ -0,0 +1,21 @@
+SHELL = /bin/bash
+PYTHON = python3
+PIP = pip3
+
+define HELP
+cat <<EOF
+ocrd_repair_inconsistencies
+
+Targets:
+	deps     Install Python dependencies via pip
+	install  Install Python package
+EOF
+endef
+export HELP
+help: ; @eval "$$HELP"
+
+deps:
+	$(PIP) install -r requirements.txt
+
+install:
+	$(PIP) install .
README.md: 37 lines changed

@@ -2,19 +2,48 @@

 Automatically re-order lines, words and glyphs to become textually consistent with their parents.

+## Introduction
+
 PAGE-XML elements with textual annotation are re-ordered by their centroid coordinates
-in top-to-bottom/left-to-right fashion iff such re-ordering fixes the inconsistency
-between their appropriately concatenated `TextEquiv` texts with their parent's `TextEquiv` text.
+iff such re-ordering fixes the inconsistency between their appropriately concatenated
+`TextEquiv` texts with their parent's `TextEquiv` text.
+
+If `TextEquiv` is missing, skip the respective elements.
+
+Where available, respect the annotated visual order:
+- For regions vs lines, sort in `top-to-bottom` fashion, unless another `textLineOrder` is annotated.
+  (Both `left-to-right` and `right-to-left` will be skipped currently.)
+- For lines vs words and words vs glyphs, sort in `left-to-right` fashion, unless another `readingDirection` is annotated.
+  (Both `top-to-bottom` and `bottom-to-top` will be skipped currently.)

 This processor does not affect `ReadingOrder` between regions, just the order of the XML elements
 below the region level, and only if not contradicting the annotated `textLineOrder`/`readingDirection`.

 We wrote this as a one-shot script to fix some files. Use with caution.

-## Example usage
+## Installation
+
+(In your venv, run:)
+
+```sh
+make deps # or pip install -r requirements.txt
+make install # or pip install .
+```
+
+## Usage
+
+Offers the following user interfaces:
+
+### [OCR-D processor](https://ocr-d.github.io/cli) CLI `ocrd-repair-inconsistencies`
+
+To be used with [PageXML](https://github.com/PRImA-Research-Lab/PAGE-XML)
+documents in an [OCR-D](https://ocr-d.github.io) annotation workflow.
+
+### Example
+
+Use the following script to repair `OCR-D-GT-PAGE` annotation in workspaces,
+and then replace it with the output on success:

-For example, use this fix script:

 ~~~sh
 #!/bin/bash
 set -e
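The re-ordering criterion described in the README (sort the children by centroid coordinate, and keep the new order only iff it repairs the mismatch with the parent text) can be sketched independently of the PAGE-XML API. The following is a minimal illustration, assuming shapely for the centroid computation; the `text` and `polygon` attributes on the children are hypothetical stand-ins for the real accessors, not the processor's interface:

```python
# Minimal sketch of the centroid-based re-ordering criterion (illustrative only).
from shapely.geometry import Polygon


def reorder_if_consistent(parent_text, children, joiner=' ', horizontal=True):
    """Return children sorted by centroid iff that makes their joined text
    match the parent's text; otherwise return them unchanged."""
    if parent_text == joiner.join(c.text for c in children):
        return children  # already consistent, nothing to do

    def position(child):
        # centroid of the child's polygon; x for left-to-right, y for top-to-bottom
        centroid = Polygon(child.polygon).centroid
        return centroid.x if horizontal else centroid.y

    resorted = sorted(children, key=position)
    if parent_text == joiner.join(c.text for c in resorted):
        return resorted  # re-ordering alone fixes the inconsistency
    return children      # back out: sorting does not explain the mismatch
```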
@@ -6,6 +6,8 @@ from collections import Sequence
 from ocrd import Processor
 from ocrd_modelfactory import page_from_file
 from ocrd_models.ocrd_page import (
+    TextRegionType, TextLineType, WordType,
+    MetadataItemType, LabelsType, LabelType,
     to_xml
 )
 from ocrd_utils import (
@@ -34,6 +36,19 @@ class RepairInconsistencies(Processor):
             pcgts = page_from_file(self.workspace.download_file(input_file))
             page = pcgts.get_Page()

+            # add metadata about this operation and its runtime parameters:
+            metadata = pcgts.get_Metadata() # ensured by from_file()
+            metadata.add_MetadataItem(
+                MetadataItemType(type_="processingStep",
+                                 name=self.ocrd_tool['steps'][0],
+                                 value=TOOL,
+                                 Labels=[LabelsType(
+                                     externalModel="ocrd-tool",
+                                     externalId="parameters",
+                                     Label=[LabelType(type_=name,
+                                                      value=self.parameter[name])
+                                            for name in self.parameter.keys()])]))
+
             regions = page.get_TextRegion()

             for region in regions:
@@ -49,7 +64,7 @@ class RepairInconsistencies(Processor):
                              page_id, region.id, textLineOrder)
                    continue

-                _fix_lines(region, page_id, reverse=textLineOrder=='bottom-to-top')
+                _fix_segment(region, page_id, reverse=(textLineOrder == 'bottom-to-top'))

                lines = region.get_TextLine()
                for line in lines:
@@ -65,7 +80,7 @@ class RepairInconsistencies(Processor):
                                  page_id, line.id, readingDirection)
                        continue

-                    _fix_words(line, page_id, reverse=readingDirection=='right-to-left')
+                    _fix_segment(line, page_id, reverse=(readingDirection == 'right-to-left'))

                    words = line.get_Word()
                    for word in words:
@@ -81,7 +96,7 @@ class RepairInconsistencies(Processor):
                                      page_id, word.id, readingDirection)
                            continue

-                        _fix_glyphs(word, page_id, reverse=readingDirection=='right-to-left')
+                        _fix_segment(word, page_id, reverse=(readingDirection == 'right-to-left'))

            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
            if file_id == input_file.ID:
@@ -95,91 +110,66 @@
                 content=to_xml(pcgts))


-def get_text(thing, joiner=None):
+def get_text(thing, joiner=''):
     """Get the text of the given thing, joining if necessary"""

-    def _get_text_for_one(t):
-        if len(t.get_TextEquiv()) != 1:
-            raise NotImplementedError
+    def _get_text_for_one(one):
         try:
-            return t.get_TextEquiv()[0].get_Unicode()
+            return one.get_TextEquiv()[0].get_Unicode()
         except Exception:
+            LOG.warning('element "%s" has no text', one.id)
             return None

     if isinstance(thing, Sequence):
-        text = joiner.join(_get_text_for_one(t) for t in thing)
+        texts = [_get_text_for_one(part) for part in thing]
+        if all(texts):
+            return joiner.join(texts)
+        return None
+    return _get_text_for_one(thing)
+
+
+def _fix_segment(segment, page_id, reverse=False):
+    """Fix order of child elements of (region/line/word) segment."""
+    if isinstance(segment, TextRegionType):
+        joiner = '\n'
+        sort_horizontal = False
+        children = segment.get_TextLine()
+        adoption = segment.set_TextLine
+    elif isinstance(segment, TextLineType):
+        joiner = ' '
+        sort_horizontal = True
+        children = segment.get_Word()
+        adoption = segment.set_Word
+    elif isinstance(segment, WordType):
+        joiner = ''
+        sort_horizontal = True
+        children = segment.get_Glyph()
+        adoption = segment.set_Glyph
     else:
-        text = _get_text_for_one(thing)
-    return text
-
-
-def _fix_words(line, page_id, reverse=False):
-    """Fix word order in a line"""
-
-    words = line.get_Word()
-    if not words:
+        raise Exception('invalid element type %s of segment to fix' % type(segment))
+    if not children:
         return
-    line_text = get_text(line)
-    words_text = get_text(words, ' ')
-    if line_text != words_text:
-        sorted_words = sorted(words, reverse=reverse,
-                              key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x)
-        sorted_words_text = get_text(sorted_words, ' ')
+    segment_text = get_text(segment)
+    concat_text = get_text(children, joiner)
+    if (segment_text and concat_text and
+            segment_text != concat_text and
+            segment_text.replace(joiner, '') != concat_text.replace(joiner, '')):
+        def polygon_position(child, horizontal=sort_horizontal):
+            polygon = Polygon(polygon_from_points(child.get_Coords().points))
+            if horizontal:
+                return polygon.centroid.x
+            return polygon.centroid.y
+        sorted_children = sorted(children, reverse=reverse, key=polygon_position)
+        sorted_concat_text = get_text(sorted_children, joiner)

-        if sorted_words_text == line_text:
-            LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id)
-            line.set_Word(sorted_words)
+        if (segment_text == sorted_concat_text or
+                segment_text.replace(joiner, '') == sorted_concat_text.replace(joiner, '')):
+            LOG.info('Fixing element order of page "%s" segment "%s"', page_id, segment.id)
+            adoption(sorted_children)
         else:
-            LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
-                      page_id, line.id,
-                      str([word.id for word in words]),
-                      str([word.id for word in sorted_words]),
-                      words_text, line_text)
-
-
-def _fix_glyphs(word, page_id, reverse=False):
-    """Fix glyph order in a word"""
-
-    glyphs = word.get_Glyph()
-    if not glyphs:
-        return
-    word_text = get_text(word)
-    glyphs_text = get_text(glyphs, '')
-    if word_text != glyphs_text:
-        sorted_glyphs = sorted(glyphs, reverse=reverse,
-                               key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x)
-        sorted_glyphs_text = get_text(sorted_glyphs, '')
-
-        if sorted_glyphs_text == word_text:
-            LOG.info('Fixing glyph order of page "%s" word "%s"', page_id, word.id)
-            word.set_Glyph(sorted_glyphs)
-        else:
-            LOG.debug('Resorting glyphs of page "%s" word "%s" from %s to %s does not suffice to turn "%s" into "%s"',
-                      page_id, word.id,
-                      str([glyph.id for glyph in glyphs]),
-                      str([glyph.id for glyph in sorted_glyphs]),
-                      glyphs_text, word_text)
-
-
-def _fix_lines(region, page_id, reverse=False):
-    """Fix line order in a region"""
-
-    lines = region.get_TextLine()
-    if not lines:
-        return
-    region_text = get_text(region)
-    lines_text = get_text(lines, '\n')
-    if region_text != lines_text:
-        sorted_lines = sorted(lines, reverse=reverse,
-                              key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y)
-        sorted_lines_text = get_text(sorted_lines, '\n')
-
-        if sorted_lines_text == region_text:
-            LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id)
-            region.set_TextLine(sorted_lines)
-        else:
-            LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
-                      page_id, region.id,
-                      str([line.id for line in lines]),
-                      str([line.id for line in sorted_lines]),
-                      lines_text, region_text)
+            LOG.debug('Resorting children of page "%s" segment "%s" from %s to %s' +
+                      'does not suffice to turn "%s" into "%s"',
+                      page_id, segment.id,
+                      str([seg.id for seg in children]),
+                      str([seg.id for seg in sorted_children]),
+                      concat_text, segment_text)
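The hunk above is the core of this PR: the three per-level fixers collapse into one `_fix_segment`, `get_text` backs out with `None` (plus a warning) instead of raising when a `TextEquiv` is missing, and the consistency test now also accepts texts that agree once the joiner is stripped. A minimal sketch of that relaxed test on plain strings; the function names here are illustrative, not part of the module:

```python
# Sketch of the relaxed concatenation test and graceful backout (plain strings only).
def join_or_none(texts, joiner):
    """Join the texts, or return None if any of them is missing/empty."""
    if all(texts):
        return joiner.join(texts)
    return None


def texts_match(segment_text, concat_text, joiner):
    """Relaxed test: equal as-is, or equal after removing the joiner."""
    if not segment_text or not concat_text:
        return False  # missing annotation: do not claim (in)consistency
    return (segment_text == concat_text or
            segment_text.replace(joiner, '') == concat_text.replace(joiner, ''))


assert texts_match('foo bar', 'foo bar', ' ')
assert texts_match('foobar', 'foo bar', ' ')       # joiner-insensitive match
assert not texts_match('bar foo', 'foo bar', ' ')  # still needs re-ordering
assert join_or_none(['foo', None], ' ') is None    # back out gracefully
```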
setup.py: 3 lines changed

@@ -6,7 +6,8 @@ from setuptools import setup, find_packages
 setup(
     name='ocrd_repair_inconsistencies',
     description='Repair glyph/word/line order inconsistencies',
-    #long_description=codecs.open('README.md', encoding='utf-8').read(),
+    long_description=codecs.open('README.md', encoding='utf-8').read(),
+    long_description_content_type='text/markdown',
     author='Mike Gerber',
     author_email='mike.gerber@sbb.spk-berlin.de',
     license='Apache License 2.0',