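# NOTE: the lines below are residue from the code-hosting web page this file
# was copied from; they are not part of the test module.
# You cannot select more than 25 topics. Topics must start with a letter or
# number, can include dashes ('-') and can be up to 35 characters long.
#
# 146 lines
# 4.4 KiB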

import logging
import unicodedata
from collections import namedtuple
import pytest
from lxml import etree as ET
from uniseg.graphemecluster import grapheme_clusters
from .. import seq_align, ExtractedText
def test_text():
    """Check the joined text and the position-to-segment lookup of a composite."""
    # NOTE(review): the outer call is reconstructed to mirror the four
    # positional arguments used by the leaf calls below, with the list of
    # sub-segments in second place and the joiner " " in third place.
    # Confirm the argument order against the ExtractedText definition.
    test1 = ExtractedText(
        None,
        [
            ExtractedText("s0", None, None, "foo"),
            ExtractedText("s1", None, None, "bar"),
            ExtractedText("s2", None, None, "bazinga"),
        ],
        " ",
        None,
    )

    assert test1.text == "foo bar bazinga"
    assert test1.segment_id_for_pos(0) == "s0"
    # Position 3 is the joining space between "foo" and "bar", so no segment
    # id is expected for it.
    assert test1.segment_id_for_pos(3) is None
    # Position 10 is the "z" of "bazinga".
    assert test1.segment_id_for_pos(10) == "s2"
def test_normalization_check():
    """Check that non-NFC text raises ValueError while NFC text is accepted."""
    # The NFD form decomposes the "ñ" into a base letter plus a combining
    # mark, which the constructor is expected to reject.
    with pytest.raises(ValueError, match=r".*is not in NFC.*"):
        ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ"))

    # The same text in NFC must construct without raising (and be truthy).
    assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ"))
# One aligned pair: the left and right items plus the id recorded for each.
AlignmentElement = namedtuple(
    "AlignmentElement",
    ["left", "right", "left_id", "right_id"],
)
def test_align():
    """
    Test aligning by character while retaining segment id info

    The difficulty here is that aligning should work on grapheme clusters,
    not Python characters.
    """
    # NOTE(review): both outer calls are reconstructed to mirror the four
    # positional arguments used by the leaf calls, with the list of
    # sub-segments in second place and the joiner " " in third place.
    # Confirm the argument order against the ExtractedText definition.
    test1 = ExtractedText(
        None,
        [
            ExtractedText("s0", None, None, "foo"),
            ExtractedText("s1", None, None, "bar"),
            ExtractedText("s2", None, None, "batzinga"),
        ],
        " ",
        None,
    )
    test2 = ExtractedText(
        None,
        [
            ExtractedText("x0", None, None, "foo"),
            ExtractedText("x1", None, None, "bar"),
            # extra .
            ExtractedText("x2", None, None, "."),
            # deletion + different grapheme cluster, m̃ also is two Python characters
            ExtractedText("x3", None, None, "bazim̃ga"),
        ],
        " ",
        None,
    )

    left_pos = 0
    right_pos = 0
    alignment = []
    for left, right in seq_align(
        grapheme_clusters(test1.text), grapheme_clusters(test2.text)
    ):
        # A side that is None has no counterpart in this step, so it gets no
        # segment id and its position does not advance.
        left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
        right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
        el = AlignmentElement(left, right, left_id, right_id)
        alignment.append(el)

        # Advance by the cluster's length in Python characters: a single
        # grapheme cluster such as "m" + U+0303 spans more than one.
        if left is not None:
            left_pos += len(left)
        if right is not None:
            right_pos += len(right)

    print("test1: {}".format(test1.text))
    print("test2: {}".format(test2.text))

    assert alignment[0] == ("f", "f", "s0", "x0")
    assert alignment[8] == (None, ".", None, "x2")
    assert alignment[12] == ("t", None, "s2", None)
    # The right-hand element is the two-code-point cluster "m" + COMBINING
    # TILDE from "x3"; an empty string can never match a cluster that has a
    # segment id.
    assert alignment[15] == ("n", "m\u0303", "s2", "x3")
@pytest.mark.parametrize(
    "attributes,expected_index,expected_log",
    [
        ([], None, None),
        (['index="0"'], 0, None),
        ([""], 0, None),
        (['conf="0.5"'], 0, None),
        (['index="1"', 'index="0"'], 1, None),
        (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
        # NOTE(review): the expected index (2) is reconstructed on the
        # assumption that sorting by 'conf' selects the highest value;
        # confirm against from_text_segment.
        (
            ['conf="0.4"', 'conf="0.5"', 'conf="0.9"'],
            2,
            "No index attributes, use 'conf' attribute to sort TextEquiv",
        ),
        (['index="0"', ""], 0, "TextEquiv without index"),
        # NOTE(review): the expected index (1) is reconstructed on the
        # assumption that the only TextEquiv carrying 'conf' is selected;
        # confirm against from_text_segment.
        (
            ["", 'conf="0.4"'],
            1,
            "No index attributes, use 'conf' attribute to sort TextEquiv",
        ),
        (["", ""], 0, "No index attributes, use first TextEquiv"),
    ],
)
def test_textequiv(attributes, expected_index, expected_log, caplog):
    """Test that extracting text from a PAGE TextEquiv is working without index attr.

    Each case builds a TextLine holding one TextEquiv per entry of
    ``attributes`` (the entry is the raw attribute string of that element),
    with the Unicode content "Text <i>" for the i-th element.

    ``expected_index`` is the position of the TextEquiv whose text should be
    returned, or None when no text is expected. ``expected_log`` is a message
    that must appear in the captured log, or None when the "no_index" marker
    must be absent from it.
    """
    # Capture INFO and above so the log assertions below do not depend on the
    # logging level configured elsewhere.
    caplog.set_level(logging.INFO)

    xml = '<?xml version="1.0"?>'
    # NOTE(review): the namespace URI is empty here; presumably this should be
    # the PAGE content namespace. Confirm and restore the intended value.
    ns = ""
    text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]

    equiv = [
        "<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
        for i, attr in enumerate(attributes)
    ]

    textline = '{0}<TextLine id="l3" xmlns="{1}">{2}</TextLine>'
    textline = textline.format(xml, ns, "".join(equiv))

    root = ET.fromstring(textline)
    # NOTE(review): `.text` is restored because the result is compared with
    # plain strings below; confirm that from_text_segment returns an object
    # exposing `.text`.
    result = ExtractedText.from_text_segment(
        root, {"page": ns}, textequiv_level="line"
    ).text

    if expected_index is None:
        assert not result
    else:
        assert result == text[expected_index]

    if expected_log is None:
        assert "no_index" not in caplog.text
    else:
        assert expected_log in caplog.text