More complex sorting for text extraction

When extracting text from TextEquiv nodes, we may encounter nodes without an index, or nodes that should be sorted via the conf attribute. Therefore, we added a more complex algorithm that selects the TextEquiv to extract and informs the user via log messages when we encounter structures that we can handle but that may produce unexpected results.
2025-12-14 22:54:13 +01:00 · 2020-10-29 09:51:15 +01:00 · 2020-10-29 09:51:15 +01:00 · 7b27b2834e
commit 7b27b2834e
parent 6ff831dfd2
2 changed files with 133 additions and 93 deletions
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@ -4,9 +4,13 @@ import unicodedata
 from contextlib import suppress
 from itertools import repeat
 from typing import Optional
 from lxml import etree as ET
 import attr
 import numpy as np
 from lxml import etree as ET
 from ocrd_utils import getLogger
 LOG = getLogger('processor.OcrdDinglehopperEvaluate')
 class Normalization(enum.Enum):
@ -53,7 +57,9 @@ def unjoin_ligatures(s):
        '\uf534': 'us',  # eMOP: Latin small ligature us
        '\uf535': 'Qu',  # eMOP: Latin ligature capital Q small u
        'ĳ': 'ij',  # U+0133 LATIN SMALL LIGATURE IJ
-        '\uE8BF': 'q&',  # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET  XXX How to replace this correctly?
+        '\uE8BF': 'q&',
        # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
        # XXX How to replace this correctly?
        '\uEBA5': 'ſp',  # MUFI: LATIN SMALL LIGATURE LONG S P
        'ﬆ': 'st',  # U+FB06 LATIN SMALL LIGATURE ST
    }
@ -178,27 +184,6 @@ class ExtractedText:
    def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'):
        """Build an ExtractedText from a PAGE content text element"""
        def invert_dict(d):
            """Invert the given dict"""
            return {v: k for k, v in d.items()}
        def get_textequiv_unicode(s):
            """Get the TextEquiv/Unicode text of the given PAGE text element"""
            textequivs = s.findall('./page:TextEquiv', namespaces=nsmap)
            if not textequivs:
                return None
            def get_index(te):
                index = te.attrib.get('index')
                try:
                    return int(index)
                except TypeError:
                    return float('inf')
            textequivs = sorted(textequivs, key=get_index)
            return textequivs[0].find('./page:Unicode', namespaces=nsmap).text
        localname_for_textequiv_level = {
            'region': 'TextRegion',
            'line': 'TextLine'
@ -216,9 +201,9 @@ class ExtractedText:
        if localname == localname_for_textequiv_level[textequiv_level]:
            segment_text = None
            with suppress(AttributeError):
-                segment_text = get_textequiv_unicode(text_segment)
+                segment_text = get_textequiv_unicode(text_segment, nsmap)
-                segment_text = segment_text or ''
+                # FIXME hardcoded SBB normalization
-                segment_text = normalize_sbb(segment_text)  # FIXME hardcoded SBB normalization
+                segment_text = normalize_sbb(segment_text)
            segment_text = segment_text or ''
            return cls(segment_id, None, None, segment_text)
        else:
@ -226,7 +211,8 @@ class ExtractedText:
            sub_localname = children_for_localname[localname]
            sub_textequiv_level = textequiv_level_for_localname[sub_localname]
            segments = []
-            for sub_segment in text_segment.iterfind('./page:%s' % sub_localname, namespaces=nsmap):
+            for sub_segment in text_segment.iterfind('./page:%s' % sub_localname,
                                                     namespaces=nsmap):
                segments.append(
                    ExtractedText.from_text_segment(
                        sub_segment, nsmap,
@ -235,8 +221,63 @@ class ExtractedText:
            joiner = joiner_for_textequiv_level[sub_textequiv_level]
            return cls(segment_id, segments, joiner, None)
    @classmethod
    def from_str(cls, text, normalization=Normalization.NFC_SBB):
        """Build an instance holding only the normalized form of ``text``.

        The text is passed through ``normalize`` with the given
        ``normalization``; the first three constructor arguments are left
        as ``None``.
        """
        return cls(None, None, None, normalize(text, normalization),
                   normalization=normalization)
 def invert_dict(d):
    """Return a new dict that maps each value of ``d`` back to its key.

    If several keys share a value, the key that comes last in iteration
    order wins.
    """
    return dict((value, key) for key, value in d.items())
 def get_textequiv_unicode(text_segment, nsmap) -> str:
    """Get the TextEquiv/Unicode text of the given PAGE text element.

    The direct ``page:TextEquiv`` children of ``text_segment`` are collected
    and one of them is chosen by ``get_first_textequiv``; the text of its
    ``page:Unicode`` child is returned.

    :param text_segment: element with an ``id`` attribute and a ``findall``
        method (an lxml/ElementTree element).
    :param nsmap: namespace mapping that resolves the ``page`` prefix.
    :return: the Unicode text, or ``''`` if there is no TextEquiv child or
        the Unicode element is empty.
    :raises KeyError: if ``text_segment`` has no ``id`` attribute.
    :raises AttributeError: if the chosen TextEquiv has no ``page:Unicode``
        child (``find`` returns ``None``).
    """
    segment_id = text_segment.attrib['id']
    textequivs = text_segment.findall('./page:TextEquiv', namespaces=nsmap)
    if not textequivs:
        return ''
    textequiv = get_first_textequiv(textequivs, segment_id)
    # An empty <Unicode/> element has ``.text is None``; map that to '' so the
    # declared ``str`` return type actually holds for callers.
    return textequiv.find('./page:Unicode', namespaces=nsmap).text or ''
 def get_first_textequiv(textequivs, segment_id):
    """Pick the preferred TextEquiv out of ``textequivs``.

    Selection order:

    1. A single candidate is returned as is.
    2. If at least one candidate has a numeric ``index``, the one with the
       lowest index wins; a warning is logged if some candidates lack one.
    3. Otherwise, if at least one candidate has a numeric ``conf``, the one
       with the highest confidence wins (logged at info level).
    4. Otherwise the first candidate is returned and a warning is logged.

    ``segment_id`` is only used in the log messages.
    """
    if len(textequivs) == 1:
        return textequivs[0]
    def numeric_values(attr_name):
        # One float per candidate; np.nan where get_attr has no value.
        return np.array([get_attr(candidate, attr_name)
                         for candidate in textequivs], dtype=float)
    indices = numeric_values('index')
    index_missing = np.isnan(indices)
    if not np.all(index_missing):
        # At least one usable index: order by index, ignoring the gaps.
        if np.any(index_missing):
            LOG.warning("TextEquiv without index in %s.", segment_id)
        return textequivs[np.nanargmin(indices)]
    confidences = numeric_values('conf')
    if not np.all(np.isnan(confidences)):
        # No index anywhere, but some confidence values: take the best one.
        LOG.info("No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
                 segment_id)
        return textequivs[np.nanargmax(confidences)]
    # Neither index nor conf present: fall back to document order.
    LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id)
    return textequivs[0]
 def get_attr(te, attr_name) -> float:
    """Extract the attribute for the given name as a float.

    Reads ``attr_name`` from ``te.attrib`` and converts it with ``float()``.

    Note: currently only handles numeric values!
    Non-existent values, and values that cannot be parsed as a number
    (e.g. ``index="abc"`` or ``index=""``), are encoded as ``np.nan``.
    """
    attr_value = te.attrib.get(attr_name)
    try:
        return float(attr_value)
    except (TypeError, ValueError):
        # TypeError: attribute is absent (None).
        # ValueError: attribute is present but not numeric text.
        return np.nan
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@ -1,8 +1,10 @@
 import logging
 import unicodedata
 import pytest
 from uniseg.graphemecluster import grapheme_clusters
 from collections import namedtuple
 import pytest
 from lxml import etree as ET
 from uniseg.graphemecluster import grapheme_clusters
 from .. import seq_align, ExtractedText
@ -45,12 +47,17 @@ def test_align():
    test2 = ExtractedText(None, [
        ExtractedText('x0', None, None, 'foo'),
        ExtractedText('x1', None, None, 'bar'),
-        ExtractedText('x2', None, None, '.'),  # extra .
+        # extra .
-        ExtractedText('x3', None, None, 'bazim̃ga'),  # deletion + different grapheme cluster, m̃ also is two Python characters
+        ExtractedText('x2', None, None, '.'),
        # deletion + different grapheme cluster, m̃ also is two Python characters
        ExtractedText('x3', None, None, 'bazim̃ga'),
    ], ' ', None)
-    left_pos = 0; right_pos = 0; alignment = []
+    left_pos = 0
-    for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)):
+    right_pos = 0
    alignment = []
    for left, right in seq_align(grapheme_clusters(test1.text),
                                 grapheme_clusters(test2.text)):
        left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
        right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
        el = AlignmentElement(left, right, left_id, right_id)
@ -69,50 +76,42 @@ def test_align():
    assert alignment[15] == ('n', 'm̃', 's2', 'x3')
-def test_textequiv_index():
+@pytest.mark.parametrize("attributes,expected_index,expected_log", [
-    """
+    ([], None, None),
-    Test that extracting text from a PAGE TextEquiv honors the "index".
+    (['index="0"'], 0, None),
-    """
+    ([''], 0, None),
    (['conf="0.5"'], 0, None),
    (['index="1"', 'index="0"'], 1, None),
    (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
    (['conf="0.4"', 'conf="0.5"', 'conf="0.9"'], 2,
     "No index attributes, use 'conf' attribute to sort TextEquiv"),
    (['index="0"', ''], 0, "TextEquiv without index"),
    (['', 'conf="0.4"'], 1,
     "No index attributes, use 'conf' attribute to sort TextEquiv"),
    (['', ''], 0, "No index attributes, use first TextEquiv"),
 ])
 def test_textequiv(attributes, expected_index, expected_log, caplog):
    """Test that extracting text from a PAGE TextEquiv is working without index attr."""
    caplog.set_level(logging.INFO)
    xml = "<?xml version=\"1.0\"?>"
    ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
    text = [f"Text {i}" for i in range(len(attributes) + 1)]
    equiv = [f"<TextEquiv {attr}><Unicode>{text[i]}</Unicode></TextEquiv>"
             for i, attr in enumerate(attributes)]
    textline = f"{xml}<TextLine id=\"l3\" xmlns=\"{ns}\">{''.join(equiv)}</TextLine>"
    # This example textline has two TextEquivs, the one with the lowest index
    # should be used. The XML order of the TextEquivs is deliberately not
    # in index order.
    textline="""<?xml version="1.0"?>
      <TextLine id="l3" xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
        <TextEquiv index="1">
          <Unicode>gefahren zu haben, einzelne Bemorkungen und Beobäch-</Unicode>
        </TextEquiv>
        <TextEquiv index="0">
          <Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>
        </TextEquiv>
      </TextLine>
    """
    root = ET.fromstring(textline)
-    nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" }
+    result = ExtractedText.from_text_segment(root,
-    result = ExtractedText.from_text_segment(root, nsmap, textequiv_level='line').text
+                                             {'page': ns},
-    expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-"
+                                             textequiv_level='line').text
    if expected_index is None:
        assert not result
    else:
        assert result == text[expected_index]
-    assert expected == result
+    if expected_log is None:
-
+        assert "no_index" not in caplog.text
-
+    else:
-def test_textequiv_no_index():
+        assert expected_log in caplog.text
    """
    Test that extracting text from a PAGE TextEquiv ignores missing indices.
    """
    textline="""<?xml version="1.0"?>
      <TextLine id="l3" xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
        <TextEquiv>
          <Unicode>gefahren zu haben, einzelne Bemorkungen und Beobäch-</Unicode>
        </TextEquiv>
        <TextEquiv index="1">
          <Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>
        </TextEquiv>
      </TextLine>
    """
    root = ET.fromstring(textline)
    nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" }
    result = ExtractedText.from_text_segment(root, nsmap, textequiv_level='line').text
    expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-"
    assert expected == result