mirror of
https://github.com/qurator-spk/ocrd_repair_inconsistencies.git
synced 2025-06-08 19:29:59 +02:00
back out gracefully when text annotation is missing
This commit is contained in:
parent
9002606e1c
commit
ad8f25666b
1 changed file with 13 additions and 11 deletions
|
@ -96,23 +96,24 @@ class RepairInconsistencies(Processor):
|
|||
content=to_xml(pcgts))
|
||||
|
||||
|
||||
def get_text(thing, joiner=None):
|
||||
def get_text(thing, joiner=''):
|
||||
"""Get the text of the given thing, joining if necessary"""
|
||||
|
||||
def _get_text_for_one(t):
|
||||
if len(t.get_TextEquiv()) != 1:
|
||||
raise NotImplementedError
|
||||
def _get_text_for_one(one):
|
||||
try:
|
||||
return t.get_TextEquiv()[0].get_Unicode()
|
||||
return one.get_TextEquiv()[0].get_Unicode()
|
||||
except Exception:
|
||||
LOG.warning('element "%s" has no text', one.id)
|
||||
return None
|
||||
|
||||
if isinstance(thing, Sequence):
|
||||
text = joiner.join(_get_text_for_one(t) for t in thing)
|
||||
texts = [_get_text_for_one(part) for part in thing]
|
||||
if all(texts):
|
||||
return joiner.join(texts)
|
||||
else:
|
||||
text = _get_text_for_one(thing)
|
||||
return text
|
||||
|
||||
return None
|
||||
else:
|
||||
return _get_text_for_one(thing)
|
||||
|
||||
def _fix_segment(segment, page_id, reverse=False):
|
||||
"""Fix order of child elements of (region/line/word) segment."""
|
||||
|
@ -138,7 +139,8 @@ def _fix_segment(segment, page_id, reverse=False):
|
|||
return
|
||||
segment_text = get_text(segment)
|
||||
concat_text = get_text(children, joiner)
|
||||
if (segment_text != concat_text and
|
||||
if (segment_text and concat_text and
|
||||
segment_text != concat_text and
|
||||
segment_text.replace(joiner, '') != concat_text.replace(joiner, '')):
|
||||
def polygon_position(child, horizontal=sort_horizontal):
|
||||
polygon = Polygon(polygon_from_points(child.get_Coords().points))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue