Sort textlines with missing indices

Python's built-in `sorted` function fails with a TypeError when the values
it has to compare mix `None` and integers:

```python
>>> sorted([None, 1])
TypeError: '<' not supported between instances of 'int' and 'NoneType'
```
Therefore we use `float('inf')` instead of `None` as the sort key for
missing textline indices, so that entries without an index sort last.
pull/37/head
Benjamin Rosemann 4 years ago
parent 082fc9e09a
commit 6ff831dfd2

@ -194,7 +194,7 @@ class ExtractedText:
try: try:
return int(index) return int(index)
except TypeError: except TypeError:
return None return float('inf')
textequivs = sorted(textequivs, key=get_index) textequivs = sorted(textequivs, key=get_index)
return textequivs[0].find('./page:Unicode', namespaces=nsmap).text return textequivs[0].find('./page:Unicode', namespaces=nsmap).text

@ -93,3 +93,26 @@ def test_textequiv_index():
expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-" expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-"
assert expected == result assert expected == result
def test_textequiv_no_index():
    """
    Check text extraction from a TextLine whose TextEquivs are only partly indexed.

    The fixture holds two TextEquiv children: the first has no ``index``
    attribute and contains a misrecognised reading, the second has
    ``index="1"`` and contains the clean reading. The extracted text is
    expected to be the one from the indexed TextEquiv.
    """
    page_ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"

    # Assemble the XML fixture line by line; the trailing empty entry yields
    # the final newline after the closing </TextLine> tag.
    xml_lines = [
        '<?xml version="1.0"?>',
        '<TextLine id="l3" xmlns="' + page_ns + '">',
        '<TextEquiv>',
        '<Unicode>gefahren zu haben, einzelne Bemorkungen und Beobäch-</Unicode>',
        '</TextEquiv>',
        '<TextEquiv index="1">',
        '<Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>',
        '</TextEquiv>',
        '</TextLine>',
        '',
    ]
    line_element = ET.fromstring("\n".join(xml_lines))

    namespaces = {'page': page_ns}
    extracted = ExtractedText.from_text_segment(
        line_element, namespaces, textequiv_level='line'
    )

    wanted = "gefahren zu haben, einzelne Bemerkungen und Beobach-"
    assert wanted == extracted.text
Loading…
Cancel
Save