diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py
index 46c8fec..916b123 100644
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -4,9 +4,13 @@ import unicodedata
 from contextlib import suppress
 from itertools import repeat
 from typing import Optional
-from lxml import etree as ET
 
 import attr
+import numpy as np
+from lxml import etree as ET
+from ocrd_utils import getLogger
+
+LOG = getLogger('processor.OcrdDinglehopperEvaluate')
 
 
 class Normalization(enum.Enum):
@@ -47,15 +51,17 @@ def unjoin_ligatures(s):
         'fl': 'fl',
         'ffi': 'ffi',
         '': 'ct',
-        '': 'tz',            # MUFI: LATIN SMALL LIGATURE TZ
+        '': 'tz',  # MUFI: LATIN SMALL LIGATURE TZ
         '\uf532': 'as',  # eMOP: Latin small ligature as
         '\uf533': 'is',  # eMOP: Latin small ligature is
         '\uf534': 'us',  # eMOP: Latin small ligature us
         '\uf535': 'Qu',  # eMOP: Latin ligature capital Q small u
-        'ij': 'ij',   # U+0133 LATIN SMALL LIGATURE IJ
-        '\uE8BF': 'q&',  # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly?
+        'ij': 'ij',  # U+0133 LATIN SMALL LIGATURE IJ
+        '\uE8BF': 'q&',
+        # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
+        # XXX How to replace this correctly?
         '\uEBA5': 'ſp',  # MUFI: LATIN SMALL LIGATURE LONG S P
-        'st': 'st',            # U+FB06 LATIN SMALL LIGATURE ST
+        'st': 'st',  # U+FB06 LATIN SMALL LIGATURE ST
     }
     s = unicodedata.normalize('NFC', s)
     for fr, to in equivalences.items():
@@ -70,14 +76,14 @@ def substitute_equivalences(s):
         '': 'ü',
         '': 'ä',
         '==': '–',  # → en-dash
-        '—': '–',   # em-dash → en-dash
+        '—': '–',  # em-dash → en-dash
         '': 'ö',
         '’': '\'',
         '⸗': '-',
-        'aͤ': 'ä',   # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
-        'oͤ': 'ö',   # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
-        'uͤ': 'ü',   # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
-        '\uF50E': 'q́'   # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
+        'aͤ': 'ä',  # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
+        'oͤ': 'ö',  # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
+        'uͤ': 'ü',  # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
+        '\uF50E': 'q́'  # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
     }
 
     s = unicodedata.normalize('NFC', s)
@@ -178,27 +184,6 @@ class ExtractedText:
     def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'):
         """Build an ExtractedText from a PAGE content text element"""
 
-        def invert_dict(d):
-            """Invert the given dict"""
-            return {v: k for k, v in d.items()}
-
-        def get_textequiv_unicode(s):
-            """Get the TextEquiv/Unicode text of the given PAGE text element"""
-            textequivs = s.findall('./page:TextEquiv', namespaces=nsmap)
-
-            if not textequivs:
-                return None
-
-            def get_index(te):
-                index = te.attrib.get('index')
-                try:
-                    return int(index)
-                except TypeError:
-                    return None
-            textequivs = sorted(textequivs, key=get_index)
-
-            return textequivs[0].find('./page:Unicode', namespaces=nsmap).text
-
         localname_for_textequiv_level = {
             'region': 'TextRegion',
             'line': 'TextLine'
@@ -216,9 +201,9 @@ class ExtractedText:
         if localname == localname_for_textequiv_level[textequiv_level]:
             segment_text = None
             with suppress(AttributeError):
-                segment_text = get_textequiv_unicode(text_segment)
-                segment_text = segment_text or ''
-                segment_text = normalize_sbb(segment_text)  # FIXME hardcoded SBB normalization
+                segment_text = get_textequiv_unicode(text_segment, nsmap)
+                # FIXME hardcoded SBB normalization
+                segment_text = normalize_sbb(segment_text)
             segment_text = segment_text or ''
             return cls(segment_id, None, None, segment_text)
         else:
@@ -226,17 +211,73 @@ class ExtractedText:
             sub_localname = children_for_localname[localname]
             sub_textequiv_level = textequiv_level_for_localname[sub_localname]
             segments = []
-            for sub_segment in text_segment.iterfind('./page:%s' % sub_localname, namespaces=nsmap):
+            for sub_segment in text_segment.iterfind('./page:%s' % sub_localname,
+                                                     namespaces=nsmap):
                 segments.append(
-                    ExtractedText.from_text_segment(
-                        sub_segment, nsmap,
-                        textequiv_level=sub_textequiv_level)
+                        ExtractedText.from_text_segment(
+                            sub_segment, nsmap,
+                            textequiv_level=sub_textequiv_level)
                 )
             joiner = joiner_for_textequiv_level[sub_textequiv_level]
             return cls(segment_id, segments, joiner, None)
 
-
     @classmethod
     def from_str(cls, text, normalization=Normalization.NFC_SBB):
         normalized_text = normalize(text, normalization)
         return cls(None, None, None, normalized_text, normalization=normalization)
+
+
+def invert_dict(d):
+    """Invert the given dict."""
+    return {v: k for k, v in d.items()}
+
+
+def get_textequiv_unicode(text_segment, nsmap) -> str:
+    """Get the TextEquiv/Unicode text of the given PAGE text element."""
+    segment_id = text_segment.attrib['id']
+    textequivs = text_segment.findall('./page:TextEquiv', namespaces=nsmap)
+
+    if not textequivs:
+        return ''
+
+    textequiv = get_first_textequiv(textequivs, segment_id)
+    return textequiv.find('./page:Unicode', namespaces=nsmap).text or ''
+
+
+def get_first_textequiv(textequivs, segment_id):
+    """Get the first TextEquiv based on index or conf order if index is not present."""
+    if len(textequivs) == 1:
+        return textequivs[0]
+
+    # try ordering by index
+    indices = np.array([get_attr(te, 'index') for te in textequivs], dtype=float)
+    nan_mask = np.isnan(indices)
+    if np.any(~nan_mask):
+        if np.any(nan_mask):
+            LOG.warning("TextEquiv without index in %s.", segment_id)
+        index = np.nanargmin(indices)
+    else:
+        # try ordering by conf
+        confidences = np.array([get_attr(te, 'conf') for te in textequivs], dtype=float)
+        if np.any(~np.isnan(confidences)):
+            LOG.info("No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
+                     segment_id)
+            index = np.nanargmax(confidences)
+        else:
+            # fallback to first entry in case of neither index nor conf present
+            LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id)
+            index = 0
+    return textequivs[index]
+
+
+def get_attr(te, attr_name) -> float:
+    """Extract the attribute for the given name.
+
+    Note: currently only handles numeric values!
+    Other or non-existent values are encoded as np.nan.
+    """
+    attr_value = te.attrib.get(attr_name)
+    try:
+        return float(attr_value)
+    except (TypeError, ValueError):
+        return np.nan
diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py
index 0d59c99..2ce81cd 100644
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@@ -1,8 +1,10 @@
+import logging
 import unicodedata
-import pytest
-from uniseg.graphemecluster import grapheme_clusters
 from collections import namedtuple
+
+import pytest
 from lxml import etree as ET
+from uniseg.graphemecluster import grapheme_clusters
 
 from .. import seq_align, ExtractedText
 
@@ -45,12 +47,17 @@ def test_align():
     test2 = ExtractedText(None, [
         ExtractedText('x0', None, None, 'foo'),
         ExtractedText('x1', None, None, 'bar'),
-        ExtractedText('x2', None, None, '.'),  # extra .
-        ExtractedText('x3', None, None, 'bazim̃ga'),  # deletion + different grapheme cluster, m̃ also is two Python characters
+        # extra .
+        ExtractedText('x2', None, None, '.'),
+        # deletion + different grapheme cluster, m̃ also is two Python characters
+        ExtractedText('x3', None, None, 'bazim̃ga'),
     ], ' ', None)
 
-    left_pos = 0; right_pos = 0; alignment = []
-    for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)):
+    left_pos = 0
+    right_pos = 0
+    alignment = []
+    for left, right in seq_align(grapheme_clusters(test1.text),
+                                 grapheme_clusters(test2.text)):
         left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
         right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
         el = AlignmentElement(left, right, left_id, right_id)
@@ -63,33 +70,49 @@ def test_align():
     print('test1: {}'.format(test1.text))
     print('test2: {}'.format(test2.text))
 
-    assert alignment[0]  == ('f', 'f', 's0', 'x0')
-    assert alignment[8]  == (None, '.', None, 'x2')
-    assert alignment[12] == ('t', None, 's2', None)
-    assert alignment[15] == ('n', 'm̃', 's2', 'x3')
-
+    assert alignment[0] == ('f', 'f', 's0', 'x0')
+    assert alignment[8] == (None, '.', None, 'x2')
+    assert alignment[12] == ('t', None, 's2', None)
+    assert alignment[15] == ('n', 'm̃', 's2', 'x3')
+
+
+@pytest.mark.parametrize("attributes,expected_index,expected_log", [
+    ([], None, None),
+    (['index="0"'], 0, None),
+    ([''], 0, None),
+    (['conf="0.5"'], 0, None),
+    (['index="1"', 'index="0"'], 1, None),
+    (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
+    (['conf="0.4"', 'conf="0.5"', 'conf="0.9"'], 2,
+     "No index attributes, use 'conf' attribute to sort TextEquiv"),
+    (['index="0"', ''], 0, "TextEquiv without index"),
+    (['', 'conf="0.4"'], 1,
+     "No index attributes, use 'conf' attribute to sort TextEquiv"),
+    (['', ''], 0, "No index attributes, use first TextEquiv"),
+])
+def test_textequiv(attributes, expected_index, expected_log, caplog):
+    """Test that extracting text from a PAGE TextEquiv is working without index attr."""
+    caplog.set_level(logging.INFO)
+    xml = "<?xml version=\"1.0\"?>"
+    ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
+    text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]
+
+    equiv = ["<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
+             for i, attr in enumerate(attributes)]
+
+    textline = "{0}<TextLine id=\"l3\" xmlns=\"{1}\">{2}</TextLine>"
+    textline = textline.format(xml, ns, ''.join(equiv))
 
-def test_textequiv_index():
-    """
-    Test that extracting text from a PAGE TextEquiv honors the "index".
-    """
-
-    # This example textline has two TextEquivs, the one with the lowest index
-    # should be used. The XML order of the TextEquivs is deliberately not
-    # in index order.
-    textline="""<?xml version="1.0"?>
-    <TextLine id="l3" xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
-        <TextEquiv index="1">
-            <Unicode>gefahren zu haben, einzelne Bemorkungen und Beobäch-</Unicode>
-        </TextEquiv>
-        <TextEquiv index="0">
-            <Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>
-        </TextEquiv>
-    </TextLine>
-    """
     root = ET.fromstring(textline)
-    nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" }
-    result = ExtractedText.from_text_segment(root, nsmap, textequiv_level='line').text
-    expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-"
-
-    assert expected == result
+    result = ExtractedText.from_text_segment(root,
+                                             {'page': ns},
+                                             textequiv_level='line').text
+    if expected_index is None:
+        assert not result
+    else:
+        assert result == text[expected_index]
+
+    if expected_log is None:
+        assert "no_index" not in caplog.text
+    else:
+        assert expected_log in caplog.text