import logging
import unicodedata
from collections import namedtuple

import pytest
from lxml import etree as ET
from uniseg.graphemecluster import grapheme_clusters

from .. import seq_align, ExtractedText


def test_text():
    """Check the joined text and the position -> segment id lookup."""
    test1 = ExtractedText(
        None,
        [
            ExtractedText("s0", None, None, "foo"),
            ExtractedText("s1", None, None, "bar"),
            ExtractedText("s2", None, None, "bazinga"),
        ],
        " ",
        None,
    )

    assert test1.text == "foo bar bazinga"
    assert test1.segment_id_for_pos(0) == "s0"
    # Index 3 is the " " joiner between "foo" and "bar"; no segment owns it.
    assert test1.segment_id_for_pos(3) is None
    assert test1.segment_id_for_pos(10) == "s2"


def test_normalization_check():
    """Text that is not in NFC must be rejected, NFC text must be accepted."""
    with pytest.raises(ValueError, match=r".*is not in NFC.*"):
        ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ"))
    assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ"))


# One aligned pair: the grapheme cluster on each side (None for a gap) and the
# id of the segment each cluster came from (None for a gap or a joiner).
AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")


def test_align():
    """
    Test aligning by character while retaining segment id info

    The difficulty here is that aligning should work on grapheme clusters,
    not Python characters.
    """

    test1 = ExtractedText(
        None,
        [
            ExtractedText("s0", None, None, "foo"),
            ExtractedText("s1", None, None, "bar"),
            ExtractedText("s2", None, None, "batzinga"),
        ],
        " ",
        None,
    )
    test2 = ExtractedText(
        None,
        [
            ExtractedText("x0", None, None, "foo"),
            ExtractedText("x1", None, None, "bar"),
            # extra .
            ExtractedText("x2", None, None, "."),
            # deletion + different grapheme cluster, m̃ also is two Python characters
            ExtractedText("x3", None, None, "bazim̃ga"),
        ],
        " ",
        None,
    )

    # Positions are tracked in Python characters (code points), because that is
    # what segment_id_for_pos() is queried with here, while the alignment itself
    # runs over grapheme clusters that may span several code points.
    left_pos = 0
    right_pos = 0
    alignment = []
    for left, right in seq_align(
        grapheme_clusters(test1.text), grapheme_clusters(test2.text)
    ):
        left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
        right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
        el = AlignmentElement(left, right, left_id, right_id)
        alignment.append(el)
        # Only advance a side when it actually contributed a cluster (no gap).
        if left is not None:
            left_pos += len(left)
        if right is not None:
            right_pos += len(right)

    print("test1: {}".format(test1.text))
    print("test2: {}".format(test2.text))

    assert alignment[0] == ("f", "f", "s0", "x0")
    assert alignment[8] == (None, ".", None, "x2")
    assert alignment[12] == ("t", None, "s2", None)
    assert alignment[15] == ("n", "m̃", "s2", "x3")


@pytest.mark.parametrize(
    "attributes,expected_index,expected_log",
    [
        ([], None, None),
        (['index="0"'], 0, None),
        ([""], 0, None),
        (['conf="0.5"'], 0, None),
        (['index="1"', 'index="0"'], 1, None),
        (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
        (
            ['conf="0.4"', 'conf="0.5"', 'conf="0.9"'],
            2,
            "No index attributes, use 'conf' attribute to sort TextEquiv",
        ),
        (['index="0"', ""], 0, "TextEquiv without index"),
        (
            ["", 'conf="0.4"'],
            1,
            "No index attributes, use 'conf' attribute to sort TextEquiv",
        ),
        (["", ""], 0, "No index attributes, use first TextEquiv"),
    ],
)
def test_textequiv(attributes, expected_index, expected_log, caplog):
    """Test that extracting text from a PAGE TextEquiv is working without index attr.

    Each entry of ``attributes`` becomes the attribute string of one TextEquiv
    child of a single TextLine. ``expected_index`` is the position of the
    TextEquiv whose text should be extracted (``None`` when no text is
    expected), and ``expected_log`` is a message that must show up in the
    captured log (``None`` when the "no_index" marker must be absent).
    """
    caplog.set_level(logging.INFO)

    # NOTE(review): the markup literals below were empty / tag-less, which left
    # `attr` and `ns` unused by format() and handed non-XML text to
    # ET.fromstring(). They are reconstructed from the format() argument
    # positions; confirm the namespace URI and the TextLine id against the
    # canonical fixture.
    xml = '<?xml version="1.0"?>'
    ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
    text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]

    equiv = [
        "<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
        for i, attr in enumerate(attributes)
    ]

    textline = '{0}<TextLine id="l3" xmlns="{1}">{2}</TextLine>'
    textline = textline.format(xml, ns, "".join(equiv))

    root = ET.fromstring(textline)
    result = ExtractedText.from_text_segment(
        root, {"page": ns}, textequiv_level="line"
    ).text

    if expected_index is None:
        assert not result
    else:
        assert result == text[expected_index]

    if expected_log is None:
        assert "no_index" not in caplog.text
    else:
        assert expected_log in caplog.text