def get_textequiv_unicode(s):
    """Get the TextEquiv/Unicode text of the given PAGE text element.

    If the element has several TextEquiv children, the one with the lowest
    integer ``index`` attribute is used. TextEquivs without a usable index
    (attribute missing or not an integer) rank after all indexed ones; ties
    are resolved by document order, so an element whose TextEquivs carry no
    index at all yields its first TextEquiv.

    ``nsmap`` is taken from the enclosing scope and must map the ``page``
    prefix to the PAGE namespace of the document.

    Returns the text of the chosen TextEquiv's Unicode child, or None if
    there is no TextEquiv, no Unicode child, or the Unicode element is empty.
    """
    def get_index(te):
        """Return a sort key that orders indexed TextEquivs first."""
        index = te.attrib.get('index')
        try:
            return (0, int(index))
        except (TypeError, ValueError):
            # TypeError: attribute missing (None); ValueError: not a number.
            # A tuple key keeps the ordering total - a bare None cannot be
            # compared with an int on Python 3.
            return (1, 0)

    textequivs = s.findall('./page:TextEquiv', namespaces=nsmap)
    if not textequivs:
        # No TextEquiv at all: report "no text" rather than raising
        # IndexError, which the caller does not handle.
        return None

    # min() returns the first minimal element, i.e. the same element a
    # stable sort would place at position 0.
    first = min(textequivs, key=get_index)
    unicode_element = first.find('./page:Unicode', namespaces=nsmap)
    if unicode_element is None:
        return None
    return unicode_element.text
def test_textequiv_index():
    """
    Test that extracting text from a PAGE TextEquiv honors the "index".
    """

    # This example textline has two TextEquivs, the one with the lowest index
    # should be used. The XML order of the TextEquivs is deliberately not
    # in index order.
    #
    # NOTE(review): the XML markup of this fixture was reconstructed from the
    # description above and the namespace used below; the concrete index
    # values and the id are assumptions - confirm against the upstream test.
    textline = """
      <TextLine id="l0" xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
        <TextEquiv index="2">
          <Unicode>gefahren zu haben, einzelne Bemorkungen und Beobäch-</Unicode>
        </TextEquiv>
        <TextEquiv index="1">
          <Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>
        </TextEquiv>
      </TextLine>
    """
    root = ET.fromstring(textline)
    nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"}
    result = ExtractedText.from_text_segment(root, nsmap, textequiv_level='line').text
    expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-"

    assert expected == result