You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
164 lines
6.4 KiB
Python
164 lines
6.4 KiB
Python
from __future__ import absolute_import

import os.path
try:
    # The ABCs live in ``collections.abc``; the aliases in ``collections``
    # were removed in Python 3.10.
    from collections.abc import Sequence
except ImportError:  # Python 2 has no ``collections.abc``
    from collections import Sequence

from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
    TextRegionType, TextLineType, WordType,
    to_xml
)
from ocrd_utils import (
    getLogger, concat_padded,
    polygon_from_points,
    MIMETYPE_PAGE
)
from shapely.geometry import Polygon

from .config import OCRD_TOOL
|
|
|
|
# Key under which this processor's description is looked up in
# OCRD_TOOL['tools'] when the processor is constructed.
TOOL = 'ocrd-repair-inconsistencies'
|
|
# Module-wide logger, used by the processor class and the helper functions below.
LOG = getLogger('processor.RepairInconsistencies')
|
|
|
|
|
|
class RepairInconsistencies(Processor):
    """Processor that repairs the order of child elements in PAGE files.

    For each input file, every text region, text line and word of the page
    is passed to ``_fix_segment`` together with a ``reverse`` flag derived
    from the effective ``textLineOrder`` (regions) or ``readingDirection``
    (lines and words). The resulting document is added to the output file
    group.
    """

    # Only these values are handled; segments with any other effective
    # value are logged and skipped.
    _LINE_ORDERS = ('top-to-bottom', 'bottom-to-top')
    _READING_DIRECTIONS = ('left-to-right', 'right-to-left')

    def __init__(self, *args, **kwargs):
        """Set up the processor with this tool's entry from ``OCRD_TOOL``."""
        kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
        super(RepairInconsistencies, self).__init__(*args, **kwargs)

    @staticmethod
    def _effective_setting(attribute, segments, default):
        """Return the first non-``None`` value of `attribute` among `segments`.

        `segments` must be ordered from the element itself outwards to its
        ancestors, so that a value set on the element wins over a value set
        further up. `default` is returned when no segment sets the attribute.
        """
        for segment in segments:
            value = getattr(segment, attribute)
            if value is not None:
                return value
        return default

    def _repair_page(self, page, page_id):
        """Fix the child order of all regions, lines and words of `page`.

        `page_id` is only used for log messages.
        """
        for region in page.get_TextRegion():
            line_order = self._effective_setting(
                'textLineOrder', [region, page], 'top-to-bottom')
            if line_order not in self._LINE_ORDERS:
                LOG.info('Not processing page "%s" region "%s" (textLineOrder=%s)',
                         page_id, region.id, line_order)
                # Skipping a region also skips all of its lines and words.
                continue
            _fix_segment(region, page_id, reverse=line_order == 'bottom-to-top')

            # The lines are fetched only after the region was fixed, because
            # fixing may have replaced the region's line list.
            for line in region.get_TextLine():
                direction = self._effective_setting(
                    'readingDirection', [line, region, page], 'left-to-right')
                if direction not in self._READING_DIRECTIONS:
                    LOG.info('Not processing page "%s" line "%s" (readingDirection=%s)',
                             page_id, line.id, direction)
                    # Skipping a line also skips all of its words.
                    continue
                _fix_segment(line, page_id, reverse=direction == 'right-to-left')

                # Same as above: fetch the words after the line was fixed.
                for word in line.get_Word():
                    direction = self._effective_setting(
                        'readingDirection', [word, line, region, page], 'left-to-right')
                    if direction not in self._READING_DIRECTIONS:
                        LOG.info('Not processing page "%s" word "%s" (readingDirection=%s)',
                                 page_id, word.id, direction)
                        continue
                    _fix_segment(word, page_id, reverse=direction == 'right-to-left')

    def process(self):
        """Repair every input file and add the result to the output file group."""
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            self._repair_page(pcgts.get_Page(), page_id)

            # Derive the output ID from the input ID; if the input ID does
            # not contain the input group name, build one from the output
            # group name and the file index instead.
            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
|
|
|
|
|
|
def get_text(thing, joiner=''):
    """Return the text of an element, or the joined text of several elements.

    If `thing` is a sequence, the texts of its members are joined with
    `joiner`; the result is ``None`` as soon as one member has no text or
    an empty text. Otherwise the text of the single element is returned,
    or ``None`` if it cannot be read.
    """

    def _text_of(element):
        # The text is taken from the element's first TextEquiv. Any failure
        # to read it is reported as a warning and treated as "no text".
        try:
            text = element.get_TextEquiv()[0].get_Unicode()
        except Exception:
            LOG.warning('element "%s" has no text', element.id)
            text = None
        return text

    if not isinstance(thing, Sequence):
        return _text_of(thing)

    # Read all members first so that every missing text gets its warning.
    parts = [_text_of(member) for member in thing]
    if not all(parts):
        return None
    return joiner.join(parts)
|
|
|
|
def _fix_segment(segment, page_id, reverse=False):
    """Re-order the children of a region, line or word to match its text.

    The children (lines of a region, words of a line, glyphs of a word) are
    only touched when the segment's own text differs from the concatenated
    text of its children, even after the joiner is removed from both. In
    that case the children are sorted by the centroid of their polygon
    (y for regions, x for lines and words; descending if `reverse`). The
    new order is adopted only if it makes the texts agree.

    `page_id` is only used for log messages. An ``Exception`` is raised for
    any other type of `segment`.
    """
    if isinstance(segment, TextRegionType):
        joiner, sort_horizontal = '\n', False
        children, adoption = segment.get_TextLine(), segment.set_TextLine
    elif isinstance(segment, TextLineType):
        joiner, sort_horizontal = ' ', True
        children, adoption = segment.get_Word(), segment.set_Word
    elif isinstance(segment, WordType):
        joiner, sort_horizontal = '', True
        children, adoption = segment.get_Glyph(), segment.set_Glyph
    else:
        raise Exception('invalid element type %s of segment to fix' % type(segment))

    if not children:
        return

    segment_text = get_text(segment)
    concat_text = get_text(children, joiner)
    # Without both texts there is nothing to compare.
    if not (segment_text and concat_text):
        return

    def same_text(left, right):
        # Equal as-is, or equal once the joiner is removed from both.
        return left == right or left.replace(joiner, '') == right.replace(joiner, '')

    # The current order is already consistent with the segment's text.
    if same_text(segment_text, concat_text):
        return

    axis = 'x' if sort_horizontal else 'y'

    def centroid_coordinate(child):
        outline = Polygon(polygon_from_points(child.get_Coords().points))
        return getattr(outline.centroid, axis)

    sorted_children = list(children)
    sorted_children.sort(key=centroid_coordinate, reverse=reverse)
    sorted_concat_text = get_text(sorted_children, joiner)

    if not same_text(segment_text, sorted_concat_text):
        LOG.debug('Resorting children of page "%s" segment "%s" from %s to %s does not suffice to turn "%s" into "%s"',
                  page_id, segment.id,
                  str([seg.id for seg in children]),
                  str([seg.id for seg in sorted_children]),
                  concat_text, segment_text)
        return

    LOG.info('Fixing element order of page "%s" segment "%s"', page_id, segment.id)
    adoption(sorted_children)
|