diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index 46c8fec..c492e7e 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -194,7 +194,7 @@ class ExtractedText: try: return int(index) except TypeError: - return None + return float('inf') textequivs = sorted(textequivs, key=get_index) return textequivs[0].find('./page:Unicode', namespaces=nsmap).text diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 0d59c99..0caf4f3 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -93,3 +93,26 @@ def test_textequiv_index(): expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-" assert expected == result + + +def test_textequiv_no_index(): + """ + Test that extracting text from a PAGE TextEquiv ignores missing indices. + """ + + textline=""" + + + gefahren zu haben, einzelne Bemorkungen und Beobäch- + + + gefahren zu haben, einzelne Bemerkungen und Beobach- + + + """ + root = ET.fromstring(textline) + nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" } + result = ExtractedText.from_text_segment(root, nsmap, textequiv_level='line').text + expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-" + + assert expected == result \ No newline at end of file