diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py
index 46c8fec..916b123 100644
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -4,9 +4,13 @@ import unicodedata
from contextlib import suppress
from itertools import repeat
from typing import Optional
-from lxml import etree as ET
import attr
+import numpy as np
+from lxml import etree as ET
+from ocrd_utils import getLogger
+
+LOG = getLogger('processor.OcrdDinglehopperEvaluate')
class Normalization(enum.Enum):
@@ -47,15 +51,17 @@ def unjoin_ligatures(s):
'fl': 'fl',
'ffi': 'ffi',
'': 'ct',
- '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
+ '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
'\uf532': 'as', # eMOP: Latin small ligature as
'\uf533': 'is', # eMOP: Latin small ligature is
'\uf534': 'us', # eMOP: Latin small ligature us
'\uf535': 'Qu', # eMOP: Latin ligature capital Q small u
- 'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ
- '\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly?
+ 'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ
+ '\uE8BF': 'q&',
+ # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
+ # XXX How to replace this correctly?
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
- 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST
+ 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST
}
s = unicodedata.normalize('NFC', s)
for fr, to in equivalences.items():
@@ -70,14 +76,14 @@ def substitute_equivalences(s):
'': 'ü',
'': 'ä',
'==': '–', # → en-dash
- '—': '–', # em-dash → en-dash
+ '—': '–', # em-dash → en-dash
'': 'ö',
'’': '\'',
'⸗': '-',
- 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
- 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
- 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
- '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
+ 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
+ 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
+ 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
+ '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
}
s = unicodedata.normalize('NFC', s)
@@ -178,27 +184,6 @@ class ExtractedText:
def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'):
"""Build an ExtractedText from a PAGE content text element"""
- def invert_dict(d):
- """Invert the given dict"""
- return {v: k for k, v in d.items()}
-
- def get_textequiv_unicode(s):
- """Get the TextEquiv/Unicode text of the given PAGE text element"""
- textequivs = s.findall('./page:TextEquiv', namespaces=nsmap)
-
- if not textequivs:
- return None
-
- def get_index(te):
- index = te.attrib.get('index')
- try:
- return int(index)
- except TypeError:
- return None
- textequivs = sorted(textequivs, key=get_index)
-
- return textequivs[0].find('./page:Unicode', namespaces=nsmap).text
-
localname_for_textequiv_level = {
'region': 'TextRegion',
'line': 'TextLine'
@@ -216,9 +201,9 @@ class ExtractedText:
if localname == localname_for_textequiv_level[textequiv_level]:
segment_text = None
with suppress(AttributeError):
- segment_text = get_textequiv_unicode(text_segment)
- segment_text = segment_text or ''
- segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization
+ segment_text = get_textequiv_unicode(text_segment, nsmap)
+ # FIXME hardcoded SBB normalization
+ segment_text = normalize_sbb(segment_text)
segment_text = segment_text or ''
return cls(segment_id, None, None, segment_text)
else:
@@ -226,17 +211,73 @@ class ExtractedText:
sub_localname = children_for_localname[localname]
sub_textequiv_level = textequiv_level_for_localname[sub_localname]
segments = []
- for sub_segment in text_segment.iterfind('./page:%s' % sub_localname, namespaces=nsmap):
+ for sub_segment in text_segment.iterfind('./page:%s' % sub_localname,
+ namespaces=nsmap):
segments.append(
- ExtractedText.from_text_segment(
- sub_segment, nsmap,
- textequiv_level=sub_textequiv_level)
+ ExtractedText.from_text_segment(
+ sub_segment, nsmap,
+ textequiv_level=sub_textequiv_level)
)
joiner = joiner_for_textequiv_level[sub_textequiv_level]
return cls(segment_id, segments, joiner, None)
-
@classmethod
def from_str(cls, text, normalization=Normalization.NFC_SBB):
normalized_text = normalize(text, normalization)
return cls(None, None, None, normalized_text, normalization=normalization)
+
+
+def invert_dict(d):
+ """Invert the given dict."""
+ return {v: k for k, v in d.items()}
+
+
+def get_textequiv_unicode(text_segment, nsmap) -> str:
+ """Get the TextEquiv/Unicode text of the given PAGE text element."""
+ segment_id = text_segment.attrib['id']
+ textequivs = text_segment.findall('./page:TextEquiv', namespaces=nsmap)
+
+ if not textequivs:
+ return ''
+
+ textequiv = get_first_textequiv(textequivs, segment_id)
+ return textequiv.find('./page:Unicode', namespaces=nsmap).text or ''
+
+
+def get_first_textequiv(textequivs, segment_id):
+ """Get the first TextEquiv based on index or conf order if index is not present."""
+ if len(textequivs) == 1:
+ return textequivs[0]
+
+ # try ordering by index
+ indices = np.array([get_attr(te, 'index') for te in textequivs], dtype=float)
+ nan_mask = np.isnan(indices)
+ if np.any(~nan_mask):
+ if np.any(nan_mask):
+ LOG.warning("TextEquiv without index in %s.", segment_id)
+ index = np.nanargmin(indices)
+ else:
+ # try ordering by conf
+ confidences = np.array([get_attr(te, 'conf') for te in textequivs], dtype=float)
+ if np.any(~np.isnan(confidences)):
+ LOG.info("No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
+ segment_id)
+ index = np.nanargmax(confidences)
+ else:
+            # fall back to the first entry when neither index nor conf is present
+ LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id)
+ index = 0
+ return textequivs[index]
+
+
+def get_attr(te, attr_name) -> float:
+ """Extract the attribute for the given name.
+
+ Note: currently only handles numeric values!
+    Other or non-existent values are encoded as np.nan.
+ """
+ attr_value = te.attrib.get(attr_name)
+ try:
+ return float(attr_value)
+    except (TypeError, ValueError):
+ return np.nan
diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py
index 0d59c99..2ce81cd 100644
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@@ -1,8 +1,10 @@
+import logging
import unicodedata
-import pytest
-from uniseg.graphemecluster import grapheme_clusters
from collections import namedtuple
+
+import pytest
from lxml import etree as ET
+from uniseg.graphemecluster import grapheme_clusters
from .. import seq_align, ExtractedText
@@ -45,12 +47,17 @@ def test_align():
test2 = ExtractedText(None, [
ExtractedText('x0', None, None, 'foo'),
ExtractedText('x1', None, None, 'bar'),
- ExtractedText('x2', None, None, '.'), # extra .
- ExtractedText('x3', None, None, 'bazim̃ga'), # deletion + different grapheme cluster, m̃ also is two Python characters
+ # extra .
+ ExtractedText('x2', None, None, '.'),
+ # deletion + different grapheme cluster, m̃ also is two Python characters
+ ExtractedText('x3', None, None, 'bazim̃ga'),
], ' ', None)
- left_pos = 0; right_pos = 0; alignment = []
- for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)):
+ left_pos = 0
+ right_pos = 0
+ alignment = []
+ for left, right in seq_align(grapheme_clusters(test1.text),
+ grapheme_clusters(test2.text)):
left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
el = AlignmentElement(left, right, left_id, right_id)
@@ -63,33 +70,49 @@ def test_align():
print('test1: {}'.format(test1.text))
print('test2: {}'.format(test2.text))
- assert alignment[0] == ('f', 'f', 's0', 'x0')
- assert alignment[8] == (None, '.', None, 'x2')
- assert alignment[12] == ('t', None, 's2', None)
- assert alignment[15] == ('n', 'm̃', 's2', 'x3')
-
+ assert alignment[0] == ('f', 'f', 's0', 'x0')
+ assert alignment[8] == (None, '.', None, 'x2')
+ assert alignment[12] == ('t', None, 's2', None)
+ assert alignment[15] == ('n', 'm̃', 's2', 'x3')
+
+
+@pytest.mark.parametrize("attributes,expected_index,expected_log", [
+ ([], None, None),
+ (['index="0"'], 0, None),
+ ([''], 0, None),
+ (['conf="0.5"'], 0, None),
+ (['index="1"', 'index="0"'], 1, None),
+ (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
+ (['conf="0.4"', 'conf="0.5"', 'conf="0.9"'], 2,
+ "No index attributes, use 'conf' attribute to sort TextEquiv"),
+ (['index="0"', ''], 0, "TextEquiv without index"),
+ (['', 'conf="0.4"'], 1,
+ "No index attributes, use 'conf' attribute to sort TextEquiv"),
+ (['', ''], 0, "No index attributes, use first TextEquiv"),
+])
+def test_textequiv(attributes, expected_index, expected_log, caplog):
+ """Test that extracting text from a PAGE TextEquiv is working without index attr."""
+ caplog.set_level(logging.INFO)
+    xml = '<?xml version="1.0"?>'
+ ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
+ text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]
+
+    equiv = ["<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
+             for i, attr in enumerate(attributes)]
+
+    textline = '{0}<TextLine id="l3" xmlns="{1}">{2}</TextLine>'
+ textline = textline.format(xml, ns, ''.join(equiv))
-def test_textequiv_index():
- """
- Test that extracting text from a PAGE TextEquiv honors the "index".
- """
-
- # This example textline has two TextEquivs, the one with the lowest index
- # should be used. The XML order of the TextEquivs is deliberately not
- # in index order.
-    textline="""<?xml version="1.0"?>
-    <TextLine id="l3" xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
-        <TextEquiv index="1">
-            <Unicode>gefahren zu haben, einzelne Bemorkungen und Beobäch-</Unicode>
-        </TextEquiv>
-        <TextEquiv index="0">
-            <Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>
-        </TextEquiv>
-    </TextLine>
-    """
root = ET.fromstring(textline)
- nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" }
- result = ExtractedText.from_text_segment(root, nsmap, textequiv_level='line').text
- expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-"
-
- assert expected == result
+ result = ExtractedText.from_text_segment(root,
+ {'page': ns},
+ textequiv_level='line').text
+ if expected_index is None:
+ assert not result
+ else:
+ assert result == text[expected_index]
+
+ if expected_log is None:
+        assert not caplog.text
+ else:
+ assert expected_log in caplog.text