mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 03:40:12 +02:00
Sort textlines with missing indices
Python's built-in `sorted` function fails with a TypeError when the values it compares mix `None` and integers:

```python
>>> sorted([None, 1])
TypeError: '<' not supported between instances of 'int' and 'NoneType'
```

Therefore we use `float('inf')` instead of `None` in case of missing textline indices, so that entries without an index sort after all indexed ones.
This commit is contained in:
parent
082fc9e09a
commit
6ff831dfd2
2 changed files with 24 additions and 1 deletions
|
@@ -194,7 +194,7 @@ class ExtractedText:
|
|||
try:
|
||||
return int(index)
|
||||
except TypeError:
|
||||
return None
|
||||
return float('inf')
|
||||
textequivs = sorted(textequivs, key=get_index)
|
||||
|
||||
return textequivs[0].find('./page:Unicode', namespaces=nsmap).text
|
||||
|
|
|
@@ -93,3 +93,26 @@ def test_textequiv_index():
|
|||
expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-"
|
||||
|
||||
assert expected == result
|
||||
|
||||
|
||||
def test_textequiv_no_index():
|
||||
"""
|
||||
Test that extracting text from a PAGE TextEquiv ignores missing indices.
|
||||
"""
|
||||
|
||||
textline="""<?xml version="1.0"?>
|
||||
<TextLine id="l3" xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
|
||||
<TextEquiv>
|
||||
<Unicode>gefahren zu haben, einzelne Bemorkungen und Beobäch-</Unicode>
|
||||
</TextEquiv>
|
||||
<TextEquiv index="1">
|
||||
<Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>
|
||||
</TextEquiv>
|
||||
</TextLine>
|
||||
"""
|
||||
root = ET.fromstring(textline)
|
||||
nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" }
|
||||
result = ExtractedText.from_text_segment(root, nsmap, textequiv_level='line').text
|
||||
expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-"
|
||||
|
||||
assert expected == result
|
Loading…
Add table
Add a link
Reference in a new issue