diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index a0be84a..916b123 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -241,7 +241,7 @@ def get_textequiv_unicode(text_segment, nsmap) -> str: return '' textequiv = get_first_textequiv(textequivs, segment_id) - return textequiv.find('./page:Unicode', namespaces=nsmap).text + return textequiv.find('./page:Unicode', namespaces=nsmap).text or '' def get_first_textequiv(textequivs, segment_id): diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 504d2ad..2ce81cd 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -95,12 +95,13 @@ def test_textequiv(attributes, expected_index, expected_log, caplog): caplog.set_level(logging.INFO) xml = "" ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" - text = [f"Text {i}" for i in range(len(attributes) + 1)] + text = ["Text {0}".format(i) for i in range(len(attributes) + 1)] - equiv = [f"{text[i]}" + equiv = ["{1}".format(attr, text[i]) for i, attr in enumerate(attributes)] - textline = f"{xml}{''.join(equiv)}" + textline = "{0}{2}" + textline = textline.format(xml, ns, ''.join(equiv)) root = ET.fromstring(textline) result = ExtractedText.from_text_segment(root,