From 6ff831dfd2f6e883ffec192f31addb40c7146eae Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Tue, 27 Oct 2020 12:33:37 +0100 Subject: [PATCH 1/3] Sort textlines with missing indices Python's built-in `sorted` function will fail with a TypeError when called on a sequence containing both `None` and integers: ```python >>> sorted([None, 1]) TypeError: '<' not supported between instances of 'int' and 'NoneType' ``` Therefore we are using `float('inf')` instead of `None` in case of missing textline indices. --- qurator/dinglehopper/extracted_text.py | 2 +- .../dinglehopper/tests/extracted_text_test.py | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index 46c8fec..c492e7e 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -194,7 +194,7 @@ class ExtractedText: try: return int(index) except TypeError: - return None + return float('inf') textequivs = sorted(textequivs, key=get_index) return textequivs[0].find('./page:Unicode', namespaces=nsmap).text diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 0d59c99..0caf4f3 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -93,3 +93,26 @@ def test_textequiv_index(): expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-" assert expected == result + + +def test_textequiv_no_index(): + """ + Test that extracting text from a PAGE TextEquiv ignores missing indices. 
+ """ + + textline=""" + + + gefahren zu haben, einzelne Bemorkungen und Beobäch- + + + gefahren zu haben, einzelne Bemerkungen und Beobach- + + + """ + root = ET.fromstring(textline) + nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" } + result = ExtractedText.from_text_segment(root, nsmap, textequiv_level='line').text + expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-" + + assert expected == result \ No newline at end of file From 7b27b2834eb4dbaaa7f77f33323793f26bb788e6 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Thu, 29 Oct 2020 09:51:15 +0100 Subject: [PATCH 2/3] More complex sorting for text extraction When extracting text from TextEquiv nodes we may encounter nodes without index or nodes that should get sorted via the conf attribute. Therefore we added a more complex algorithm to extract a TextEquiv and inform the user via log messages if we encounter structures that we can handle but may produce unexpected results. --- qurator/dinglehopper/extracted_text.py | 119 ++++++++++++------ .../dinglehopper/tests/extracted_text_test.py | 113 +++++++++-------- 2 files changed, 136 insertions(+), 96 deletions(-) diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index c492e7e..a0be84a 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -4,9 +4,13 @@ import unicodedata from contextlib import suppress from itertools import repeat from typing import Optional -from lxml import etree as ET import attr +import numpy as np +from lxml import etree as ET +from ocrd_utils import getLogger + +LOG = getLogger('processor.OcrdDinglehopperEvaluate') class Normalization(enum.Enum): @@ -47,15 +51,17 @@ def unjoin_ligatures(s): 'fl': 'fl', 'ffi': 'ffi', '': 'ct', - '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ + '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ '\uf532': 'as', # eMOP: Latin small ligature as '\uf533': 'is', # eMOP: Latin small 
ligature is '\uf534': 'us', # eMOP: Latin small ligature us '\uf535': 'Qu', # eMOP: Latin ligature capital Q small u - 'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ - '\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly? + 'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ + '\uE8BF': 'q&', + # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET + # XXX How to replace this correctly? '\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P - 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST + 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST } s = unicodedata.normalize('NFC', s) for fr, to in equivalences.items(): @@ -70,14 +76,14 @@ def substitute_equivalences(s): '': 'ü', '': 'ä', '==': '–', # → en-dash - '—': '–', # em-dash → en-dash + '—': '–', # em-dash → en-dash '': 'ö', '’': '\'', '⸗': '-', - 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E - 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E - 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E - '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT + 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E + 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E + 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E + '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT } s = unicodedata.normalize('NFC', s) @@ -178,27 +184,6 @@ class ExtractedText: def from_text_segment(cls, text_segment, nsmap, textequiv_level='region'): """Build an ExtractedText from a PAGE content text element""" - def invert_dict(d): - """Invert the given dict""" - return {v: k for k, v in d.items()} - - def get_textequiv_unicode(s): - """Get the TextEquiv/Unicode text of the given PAGE text element""" - textequivs = s.findall('./page:TextEquiv', namespaces=nsmap) - - if not textequivs: - return None - - def get_index(te): - index = te.attrib.get('index') - try: - return int(index) - except TypeError: - return 
float('inf') - textequivs = sorted(textequivs, key=get_index) - - return textequivs[0].find('./page:Unicode', namespaces=nsmap).text - localname_for_textequiv_level = { 'region': 'TextRegion', 'line': 'TextLine' @@ -216,9 +201,9 @@ class ExtractedText: if localname == localname_for_textequiv_level[textequiv_level]: segment_text = None with suppress(AttributeError): - segment_text = get_textequiv_unicode(text_segment) - segment_text = segment_text or '' - segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization + segment_text = get_textequiv_unicode(text_segment, nsmap) + # FIXME hardcoded SBB normalization + segment_text = normalize_sbb(segment_text) segment_text = segment_text or '' return cls(segment_id, None, None, segment_text) else: @@ -226,17 +211,73 @@ class ExtractedText: sub_localname = children_for_localname[localname] sub_textequiv_level = textequiv_level_for_localname[sub_localname] segments = [] - for sub_segment in text_segment.iterfind('./page:%s' % sub_localname, namespaces=nsmap): + for sub_segment in text_segment.iterfind('./page:%s' % sub_localname, + namespaces=nsmap): segments.append( - ExtractedText.from_text_segment( - sub_segment, nsmap, - textequiv_level=sub_textequiv_level) + ExtractedText.from_text_segment( + sub_segment, nsmap, + textequiv_level=sub_textequiv_level) ) joiner = joiner_for_textequiv_level[sub_textequiv_level] return cls(segment_id, segments, joiner, None) - @classmethod def from_str(cls, text, normalization=Normalization.NFC_SBB): normalized_text = normalize(text, normalization) return cls(None, None, None, normalized_text, normalization=normalization) + + +def invert_dict(d): + """Invert the given dict.""" + return {v: k for k, v in d.items()} + + +def get_textequiv_unicode(text_segment, nsmap) -> str: + """Get the TextEquiv/Unicode text of the given PAGE text element.""" + segment_id = text_segment.attrib['id'] + textequivs = text_segment.findall('./page:TextEquiv', namespaces=nsmap) + + if not 
textequivs: + return '' + + textequiv = get_first_textequiv(textequivs, segment_id) + return textequiv.find('./page:Unicode', namespaces=nsmap).text + + +def get_first_textequiv(textequivs, segment_id): + """Get the first TextEquiv based on index or conf order if index is not present.""" + if len(textequivs) == 1: + return textequivs[0] + + # try ordering by index + indices = np.array([get_attr(te, 'index') for te in textequivs], dtype=float) + nan_mask = np.isnan(indices) + if np.any(~nan_mask): + if np.any(nan_mask): + LOG.warning("TextEquiv without index in %s.", segment_id) + index = np.nanargmin(indices) + else: + # try ordering by conf + confidences = np.array([get_attr(te, 'conf') for te in textequivs], dtype=float) + if np.any(~np.isnan(confidences)): + LOG.info("No index attributes, use 'conf' attribute to sort TextEquiv in %s.", + segment_id) + index = np.nanargmax(confidences) + else: + # fallback to first entry in case of neither index nor conf present + LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id) + index = 0 + return textequivs[index] + + +def get_attr(te, attr_name) -> float: + """Extract the attribute for the given name. + + Note: currently only handles numeric values! + Other or nonexistent values are encoded as np.nan. + """ + attr_value = te.attrib.get(attr_name) + try: + return float(attr_value) + except TypeError: + return np.nan diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 0caf4f3..504d2ad 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -1,8 +1,10 @@ +import logging import unicodedata -import pytest -from uniseg.graphemecluster import grapheme_clusters from collections import namedtuple + +import pytest from lxml import etree as ET +from uniseg.graphemecluster import grapheme_clusters from .. 
import seq_align, ExtractedText @@ -45,12 +47,17 @@ def test_align(): test2 = ExtractedText(None, [ ExtractedText('x0', None, None, 'foo'), ExtractedText('x1', None, None, 'bar'), - ExtractedText('x2', None, None, '.'), # extra . - ExtractedText('x3', None, None, 'bazim̃ga'), # deletion + different grapheme cluster, m̃ also is two Python characters + # extra . + ExtractedText('x2', None, None, '.'), + # deletion + different grapheme cluster, m̃ also is two Python characters + ExtractedText('x3', None, None, 'bazim̃ga'), ], ' ', None) - left_pos = 0; right_pos = 0; alignment = [] - for left, right in seq_align(grapheme_clusters(test1.text), grapheme_clusters(test2.text)): + left_pos = 0 + right_pos = 0 + alignment = [] + for left, right in seq_align(grapheme_clusters(test1.text), + grapheme_clusters(test2.text)): left_id = test1.segment_id_for_pos(left_pos) if left is not None else None right_id = test2.segment_id_for_pos(right_pos) if right is not None else None el = AlignmentElement(left, right, left_id, right_id) @@ -63,56 +70,48 @@ def test_align(): print('test1: {}'.format(test1.text)) print('test2: {}'.format(test2.text)) - assert alignment[0] == ('f', 'f', 's0', 'x0') - assert alignment[8] == (None, '.', None, 'x2') - assert alignment[12] == ('t', None, 's2', None) - assert alignment[15] == ('n', 'm̃', 's2', 'x3') + assert alignment[0] == ('f', 'f', 's0', 'x0') + assert alignment[8] == (None, '.', None, 'x2') + assert alignment[12] == ('t', None, 's2', None) + assert alignment[15] == ('n', 'm̃', 's2', 'x3') + + +@pytest.mark.parametrize("attributes,expected_index,expected_log", [ + ([], None, None), + (['index="0"'], 0, None), + ([''], 0, None), + (['conf="0.5"'], 0, None), + (['index="1"', 'index="0"'], 1, None), + (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"), + (['conf="0.4"', 'conf="0.5"', 'conf="0.9"'], 2, + "No index attributes, use 'conf' attribute to sort TextEquiv"), + (['index="0"', ''], 0, "TextEquiv without index"), + 
(['', 'conf="0.4"'], 1, + "No index attributes, use 'conf' attribute to sort TextEquiv"), + (['', ''], 0, "No index attributes, use first TextEquiv"), +]) +def test_textequiv(attributes, expected_index, expected_log, caplog): + """Test that extracting text from a PAGE TextEquiv is working without index attr.""" + caplog.set_level(logging.INFO) + xml = "" + ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" + text = [f"Text {i}" for i in range(len(attributes) + 1)] + + equiv = [f"{text[i]}" + for i, attr in enumerate(attributes)] + + textline = f"{xml}{''.join(equiv)}" - -def test_textequiv_index(): - """ - Test that extracting text from a PAGE TextEquiv honors the "index". - """ - - # This example textline has two TextEquivs, the one with the lowest index - # should be used. The XML order of the TextEquivs is deliberately not - # in index order. - textline=""" - - - gefahren zu haben, einzelne Bemorkungen und Beobäch- - - - gefahren zu haben, einzelne Bemerkungen und Beobach- - - - """ root = ET.fromstring(textline) - nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" } - result = ExtractedText.from_text_segment(root, nsmap, textequiv_level='line').text - expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-" - - assert expected == result - - -def test_textequiv_no_index(): - """ - Test that extracting text from a PAGE TextEquiv ignores missing indices. 
- """ - - textline=""" - - - gefahren zu haben, einzelne Bemorkungen und Beobäch- - - - gefahren zu haben, einzelne Bemerkungen und Beobach- - - - """ - root = ET.fromstring(textline) - nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" } - result = ExtractedText.from_text_segment(root, nsmap, textequiv_level='line').text - expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-" - - assert expected == result \ No newline at end of file + result = ExtractedText.from_text_segment(root, + {'page': ns}, + textequiv_level='line').text + if expected_index is None: + assert not result + else: + assert result == text[expected_index] + + if expected_log is None: + assert "no_index" not in caplog.text + else: + assert expected_log in caplog.text From c02569b41ee1143b4174b278accf50c22a62f4b0 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Thu, 29 Oct 2020 12:33:54 +0100 Subject: [PATCH 3/3] Fix f-strings for Python 3.5 --- qurator/dinglehopper/extracted_text.py | 2 +- qurator/dinglehopper/tests/extracted_text_test.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index a0be84a..916b123 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -241,7 +241,7 @@ def get_textequiv_unicode(text_segment, nsmap) -> str: return '' textequiv = get_first_textequiv(textequivs, segment_id) - return textequiv.find('./page:Unicode', namespaces=nsmap).text + return textequiv.find('./page:Unicode', namespaces=nsmap).text or '' def get_first_textequiv(textequivs, segment_id): diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 504d2ad..2ce81cd 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -95,12 +95,13 @@ def test_textequiv(attributes, expected_index, 
expected_log, caplog): caplog.set_level(logging.INFO) xml = "" ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" - text = [f"Text {i}" for i in range(len(attributes) + 1)] + text = ["Text {0}".format(i) for i in range(len(attributes) + 1)] - equiv = [f"{text[i]}" + equiv = ["{1}".format(attr, text[i]) for i, attr in enumerate(attributes)] - textline = f"{xml}{''.join(equiv)}" + textline = "{0}{2}" + textline = textline.format(xml, ns, ''.join(equiv)) root = ET.fromstring(textline) result = ExtractedText.from_text_segment(root,