1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-07-12 11:59:58 +02:00

🐛 dinglehopper: Honor TextEquiv index (Closes GH-33)

This commit is contained in:
Gerber, Mike 2020-10-21 17:50:21 +02:00
parent f626a2ebe6
commit 19d15e3ecc
2 changed files with 41 additions and 1 deletions

View file

@ -2,6 +2,7 @@ import unicodedata
import pytest
from uniseg.graphemecluster import grapheme_clusters
from collections import namedtuple
from lxml import etree as ET
from .. import seq_align, ExtractedText
@ -66,3 +67,29 @@ def test_align():
assert alignment[8] == (None, '.', None, 'x2')
assert alignment[12] == ('t', None, 's2', None)
assert alignment[15] == ('n', '', 's2', 'x3')
def test_textequiv_index():
    """
    Check that the "index" attribute of PAGE TextEquivs decides which text
    is extracted from a text segment.
    """
    # Text of the TextEquiv carrying index="0"; this is what must come back.
    expected = "gefahren zu haben, einzelne Bemerkungen und Beobach-"

    namespaces = {
        'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15",
    }

    # One TextLine holding two competing TextEquivs. The one with the lowest
    # index is expected to win. The index="1" variant is placed first in
    # document order on purpose, so XML order and index order disagree.
    line_xml = """<?xml version="1.0"?>
<TextLine id="l3" xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
<TextEquiv index="1">
<Unicode>gefahren zu haben, einzelne Bemorkungen und Beobäch-</Unicode>
</TextEquiv>
<TextEquiv index="0">
<Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>
</TextEquiv>
</TextLine>
"""
    line_element = ET.fromstring(line_xml)

    # Extract at line level and compare the resulting plain text.
    segment = ExtractedText.from_text_segment(
        line_element, namespaces, textequiv_level='line'
    )
    result = segment.text

    assert expected == result