From 6ff831dfd2f6e883ffec192f31addb40c7146eae Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Tue, 27 Oct 2020 12:33:37 +0100 Subject: [PATCH] Sort textlines with missing indices Python's built-in `sorted` function fails with a `TypeError` when the values being sorted mix `None` and integers: ```python >>> sorted([None, 1]) TypeError: '<' not supported between instances of 'int' and 'NoneType' ``` Therefore we are using `float('inf')` instead of `None` in case of missing textline indices; it compares greater than every integer, so entries without an index sort last. --- qurator/dinglehopper/extracted_text.py | 2 +- .../dinglehopper/tests/extracted_text_test.py | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index 46c8fec..c492e7e 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -194,7 +194,7 @@ class ExtractedText: try: return int(index) except TypeError: - return None + return float('inf') textequivs = sorted(textequivs, key=get_index) return textequivs[0].find('./page:Unicode', namespaces=nsmap).text diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 0d59c99..0caf4f3 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -93,3 +93,26 @@ def test_textequiv_index(): expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-" assert expected == result + + +def test_textequiv_no_index(): + """ + Test that extracting text from a PAGE TextEquiv ignores missing indices. 
+ """ + + textline=""" + + + gefahren zu haben, einzelne Bemorkungen und Beobäch- + + + gefahren zu haben, einzelne Bemerkungen und Beobach- + + + """ + root = ET.fromstring(textline) + nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" } + result = ExtractedText.from_text_segment(root, nsmap, textequiv_level='line').text + expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-" + + assert expected == result \ No newline at end of file