Mirror of https://github.com/qurator-spk/dinglehopper.git
Installation had been broken since the move to pyproject.toml, which we didn't notice because of
leftover files in build/. Fix this by following the convention of keeping the source files
in src/ and adjusting pyproject.toml accordingly.
Fixes gh-86. 🤞
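For context, the src-layout is typically declared along these lines with setuptools (a sketch of the general convention, not necessarily the exact change made in this commit):

    [build-system]
    requires = ["setuptools>=61"]
    build-backend = "setuptools.build_meta"

    [tool.setuptools.packages.find]
    where = ["src"]

With package discovery restricted to src/, stale modules left over in build/ or at the repository root are no longer picked up when the package is installed.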
145 lines · 4.4 KiB · Python
import logging
import unicodedata
from collections import namedtuple

import pytest
from lxml import etree as ET
from uniseg.graphemecluster import grapheme_clusters

from .. import seq_align, ExtractedText


def test_text():
    test1 = ExtractedText(
        None,
        [
            ExtractedText("s0", None, None, "foo"),
            ExtractedText("s1", None, None, "bar"),
            ExtractedText("s2", None, None, "bazinga"),
        ],
        " ",
        None,
    )

    assert test1.text == "foo bar bazinga"
    assert test1.segment_id_for_pos(0) == "s0"
    assert test1.segment_id_for_pos(3) is None
    assert test1.segment_id_for_pos(10) == "s2"


def test_normalization_check():
    with pytest.raises(ValueError, match=r".*is not in NFC.*"):
        ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ"))
    assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ"))


AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")


def test_align():
    """
    Test aligning by character while retaining segment id info

    The difficulty here is that aligning should work on grapheme clusters,
    not Python characters.
    """

    test1 = ExtractedText(
        None,
        [
            ExtractedText("s0", None, None, "foo"),
            ExtractedText("s1", None, None, "bar"),
            ExtractedText("s2", None, None, "batzinga"),
        ],
        " ",
        None,
    )
    test2 = ExtractedText(
        None,
        [
            ExtractedText("x0", None, None, "foo"),
            ExtractedText("x1", None, None, "bar"),
            # extra .
            ExtractedText("x2", None, None, "."),
            # deletion + different grapheme cluster, m̃ also is two Python characters
            ExtractedText("x3", None, None, "bazim̃ga"),
        ],
        " ",
        None,
    )

    left_pos = 0
    right_pos = 0
    alignment = []
    for left, right in seq_align(
        grapheme_clusters(test1.text), grapheme_clusters(test2.text)
    ):
        left_id = test1.segment_id_for_pos(left_pos) if left is not None else None
        right_id = test2.segment_id_for_pos(right_pos) if right is not None else None
        el = AlignmentElement(left, right, left_id, right_id)
        alignment.append(el)
        if left is not None:
            left_pos += len(left)
        if right is not None:
            right_pos += len(right)

    print("test1: {}".format(test1.text))
    print("test2: {}".format(test2.text))

    assert alignment[0] == ("f", "f", "s0", "x0")
    assert alignment[8] == (None, ".", None, "x2")
    assert alignment[12] == ("t", None, "s2", None)
    assert alignment[15] == ("n", "m̃", "s2", "x3")


@pytest.mark.parametrize(
    "attributes,expected_index,expected_log",
    [
        ([], None, None),
        (['index="0"'], 0, None),
        ([""], 0, None),
        (['conf="0.5"'], 0, None),
        (['index="1"', 'index="0"'], 1, None),
        (['index="0" conf="0.4"', 'conf="0.5"'], 0, "TextEquiv without index"),
        (
            ['conf="0.4"', 'conf="0.5"', 'conf="0.9"'],
            2,
            "No index attributes, use 'conf' attribute to sort TextEquiv",
        ),
        (['index="0"', ""], 0, "TextEquiv without index"),
        (
            ["", 'conf="0.4"'],
            1,
            "No index attributes, use 'conf' attribute to sort TextEquiv",
        ),
        (["", ""], 0, "No index attributes, use first TextEquiv"),
    ],
)
def test_textequiv(attributes, expected_index, expected_log, caplog):
    """Test that extracting text from a PAGE TextEquiv is working without index attr."""
    caplog.set_level(logging.INFO)
    xml = '<?xml version="1.0"?>'
    ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15"
    text = ["Text {0}".format(i) for i in range(len(attributes) + 1)]

    equiv = [
        "<TextEquiv {0}><Unicode>{1}</Unicode></TextEquiv>".format(attr, text[i])
        for i, attr in enumerate(attributes)
    ]

    textline = '{0}<TextLine id="l3" xmlns="{1}">{2}</TextLine>'
    textline = textline.format(xml, ns, "".join(equiv))

    root = ET.fromstring(textline)
    result = ExtractedText.from_text_segment(
        root, {"page": ns}, textequiv_level="line"
    ).text
    if expected_index is None:
        assert not result
    else:
        assert result == text[expected_index]

    if expected_log is None:
        assert "no_index" not in caplog.text
    else:
        assert expected_log in caplog.text
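As an aside on the docstring of test_align above: a quick way to see why the alignment iterates over grapheme clusters rather than Python code points (a standalone illustration, not part of the test file):

    from uniseg.graphemecluster import grapheme_clusters

    s = "bazim̃ga"
    # "m̃" is one user-perceived character but two code points (m + combining tilde),
    # so the code-point length and the grapheme-cluster count differ.
    print(len(s))                            # 8
    print(len(list(grapheme_clusters(s))))   # 7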