mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 11:50:00 +02:00
🐛 dinglehopper: Honor TextEquiv index (Closes GH-33)
This commit is contained in:
parent
f626a2ebe6
commit
19d15e3ecc
2 changed files with 41 additions and 1 deletions
|
@ -182,6 +182,19 @@ class ExtractedText:
|
|||
"""Invert the given dict"""
|
||||
return {v: k for k, v in d.items()}
|
||||
|
||||
def get_textequiv_unicode(s):
    """Get the TextEquiv/Unicode text of the given PAGE text element.

    If the element has several TextEquiv children, the one with the lowest
    integer ``index`` attribute is used. TextEquivs whose ``index`` is missing
    or not an integer are ranked after all properly indexed ones; ties are
    resolved in document order.

    ``nsmap`` is not a parameter; it is looked up in the enclosing scope and
    must map the ``page`` prefix to the PAGE namespace.

    :param s: the PAGE text element (e.g. a TextLine) to read from
    :return: the text of the selected TextEquiv's Unicode child, or None if
        the element has no TextEquiv, the selected TextEquiv has no Unicode
        child, or that Unicode child is empty
    """
    textequivs = s.findall('./page:TextEquiv', namespaces=nsmap)

    def get_index(te):
        # Build a sort key that is always comparable: comparing None with an
        # int raises TypeError in Python 3, so un-indexed entries get their
        # own rank instead of a None key.
        index = te.attrib.get('index')
        try:
            return (0, int(index))
        except (TypeError, ValueError):
            # TypeError: attribute missing (int(None));
            # ValueError: attribute present but not an integer.
            return (1, 0)

    # min() returns the first minimal item, i.e. the same element a stable
    # sort would put first, and copes with an empty list via `default`.
    first = min(textequivs, key=get_index, default=None)
    if first is None:
        return None

    unicode_element = first.find('./page:Unicode', namespaces=nsmap)
    if unicode_element is None:
        return None
    return unicode_element.text
|
||||
|
||||
localname_for_textequiv_level = {
|
||||
'region': 'TextRegion',
|
||||
'line': 'TextLine'
|
||||
|
@ -199,7 +212,7 @@ class ExtractedText:
|
|||
if localname == localname_for_textequiv_level[textequiv_level]:
|
||||
segment_text = None
|
||||
with suppress(AttributeError):
|
||||
segment_text = text_segment.find('./page:TextEquiv/page:Unicode', namespaces=nsmap).text
|
||||
segment_text = get_textequiv_unicode(text_segment)
|
||||
segment_text = segment_text or ''
|
||||
segment_text = normalize_sbb(segment_text) # FIXME hardcoded SBB normalization
|
||||
segment_text = segment_text or ''
|
||||
|
|
|
@ -2,6 +2,7 @@ import unicodedata
|
|||
import pytest
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
from collections import namedtuple
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import seq_align, ExtractedText
|
||||
|
||||
|
@ -66,3 +67,29 @@ def test_align():
|
|||
assert alignment[8] == (None, '.', None, 'x2')
|
||||
assert alignment[12] == ('t', None, 's2', None)
|
||||
assert alignment[15] == ('n', 'm̃', 's2', 'x3')
|
||||
|
||||
|
||||
def test_textequiv_index():
    """
    Check that text extraction from a PAGE element respects TextEquiv "index".
    """

    # The TextLine below carries two TextEquivs. The index="0" one is the
    # expected result, and it is intentionally placed second in the XML so
    # that picking the first child in document order would fail the test.
    page_xml = """<?xml version="1.0"?>
<TextLine id="l3" xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15">
<TextEquiv index="1">
<Unicode>gefahren zu haben, einzelne Bemorkungen und Beobäch-</Unicode>
</TextEquiv>
<TextEquiv index="0">
<Unicode>gefahren zu haben, einzelne Bemerkungen und Beobach-</Unicode>
</TextEquiv>
</TextLine>
"""
    nsmap = {'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"}
    line_element = ET.fromstring(page_xml)

    extracted = ExtractedText.from_text_segment(
        line_element, nsmap, textequiv_level='line'
    )

    assert extracted.text == "gefahren zu haben, einzelne Bemerkungen und Beobach-"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue