mirror of
https://github.com/qurator-spk/ocrd_repair_inconsistencies.git
synced 2025-06-08 19:29:59 +02:00
back out gracefully when text annotation is missing
This commit is contained in:
parent
9002606e1c
commit
ad8f25666b
1 changed file with 13 additions and 11 deletions
|
@ -96,23 +96,24 @@ class RepairInconsistencies(Processor):
|
||||||
content=to_xml(pcgts))
|
content=to_xml(pcgts))
|
||||||
|
|
||||||
|
|
||||||
def get_text(thing, joiner=None):
|
def get_text(thing, joiner=''):
|
||||||
"""Get the text of the given thing, joining if necessary"""
|
"""Get the text of the given thing, joining if necessary"""
|
||||||
|
|
||||||
def _get_text_for_one(t):
|
def _get_text_for_one(one):
|
||||||
if len(t.get_TextEquiv()) != 1:
|
|
||||||
raise NotImplementedError
|
|
||||||
try:
|
try:
|
||||||
return t.get_TextEquiv()[0].get_Unicode()
|
return one.get_TextEquiv()[0].get_Unicode()
|
||||||
except Exception:
|
except Exception:
|
||||||
|
LOG.warning('element "%s" has no text', one.id)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if isinstance(thing, Sequence):
|
if isinstance(thing, Sequence):
|
||||||
text = joiner.join(_get_text_for_one(t) for t in thing)
|
texts = [_get_text_for_one(part) for part in thing]
|
||||||
|
if all(texts):
|
||||||
|
return joiner.join(texts)
|
||||||
else:
|
else:
|
||||||
text = _get_text_for_one(thing)
|
return None
|
||||||
return text
|
else:
|
||||||
|
return _get_text_for_one(thing)
|
||||||
|
|
||||||
def _fix_segment(segment, page_id, reverse=False):
|
def _fix_segment(segment, page_id, reverse=False):
|
||||||
"""Fix order of child elements of (region/line/word) segment."""
|
"""Fix order of child elements of (region/line/word) segment."""
|
||||||
|
@ -138,7 +139,8 @@ def _fix_segment(segment, page_id, reverse=False):
|
||||||
return
|
return
|
||||||
segment_text = get_text(segment)
|
segment_text = get_text(segment)
|
||||||
concat_text = get_text(children, joiner)
|
concat_text = get_text(children, joiner)
|
||||||
if (segment_text != concat_text and
|
if (segment_text and concat_text and
|
||||||
|
segment_text != concat_text and
|
||||||
segment_text.replace(joiner, '') != concat_text.replace(joiner, '')):
|
segment_text.replace(joiner, '') != concat_text.replace(joiner, '')):
|
||||||
def polygon_position(child, horizontal=sort_horizontal):
|
def polygon_position(child, horizontal=sort_horizontal):
|
||||||
polygon = Polygon(polygon_from_points(child.get_Coords().points))
|
polygon = Polygon(polygon_from_points(child.get_Coords().points))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue