mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 20:00:01 +02:00
Sort textlines with missing indices
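Python's built-in `sorted` function raises a `TypeError` when the values being compared mix `None` and integers:

```python
>>> sorted([None, 1])
TypeError: '<' not supported between instances of 'int' and 'NoneType'
```

Therefore we are using `float('inf')` instead of `None` as the sort key in case of missing textline indices. `float('inf')` compares greater than every integer, so entries without an index sort after all entries that have one.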
This commit is contained in:
parent
082fc9e09a
commit
6ff831dfd2
2 changed files with 24 additions and 1 deletions
|
@ -194,7 +194,7 @@ class ExtractedText:
|
||||||
try:
|
try:
|
||||||
return int(index)
|
return int(index)
|
||||||
except TypeError:
|
except TypeError:
|
||||||
return None
|
return float('inf')
|
||||||
textequivs = sorted(textequivs, key=get_index)
|
textequivs = sorted(textequivs, key=get_index)
|
||||||
|
|
||||||
return textequivs[0].find('./page:Unicode', namespaces=nsmap).text
|
return textequivs[0].find('./page:Unicode', namespaces=nsmap).text
|
||||||
|
|
|
@ -93,3 +93,26 @@ def test_textequiv_index():
|
||||||
expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-"
|
expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-"
|
||||||
|
|
||||||
assert expected == result
|
assert expected == result
|
||||||
|
|
||||||
|
|
||||||
|
def test_textequiv_no_index():
|
||||||
|
"""
|
||||||
|
Test that extracting text from a PAGE TextEquiv ignores missing indices.
|
||||||
|
"""
|
||||||
|
|
||||||
|
textline="""<?xml version="1.0"?>
|
||||||
|
<TextLine id="l3" xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
|
||||||
|
<TextEquiv>
|
||||||
|
<Unicode>gefahren zu haben, einzelne Bemorkungen und Beobäch-</Unicode>
|
||||||
|
</TextEquiv>
|
||||||
|
<TextEquiv index="1">
|
||||||
|
<Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>
|
||||||
|
</TextEquiv>
|
||||||
|
</TextLine>
|
||||||
|
"""
|
||||||
|
root = ET.fromstring(textline)
|
||||||
|
nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" }
|
||||||
|
result = ExtractedText.from_text_segment(root, nsmap, textequiv_level='line').text
|
||||||
|
expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-"
|
||||||
|
|
||||||
|
assert expected == result
|
Loading…
Add table
Add a link
Reference in a new issue