From a18b25b1633d71b1019e3952eee26c78ba6c2d12 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 27 Jan 2023 19:13:45 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Update=20tests=20for=20Extracted?= =?UTF-8?q?Text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In PR gh-72, @maxbachmann introduced a new argument for ExtractedText(). Update the corresponding tests. --- .../dinglehopper/tests/extracted_text_test.py | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 8a81587..bc230d6 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -13,12 +13,13 @@ def test_text(): test1 = ExtractedText( None, [ - ExtractedText("s0", None, None, "foo"), - ExtractedText("s1", None, None, "bar"), - ExtractedText("s2", None, None, "bazinga"), + ExtractedText("s0", None, None, "foo", grapheme_clusters("foo")), + ExtractedText("s1", None, None, "bar", grapheme_clusters("bar")), + ExtractedText("s2", None, None, "bazinga", grapheme_clusters("bazinga")), ], " ", None, + None, ) assert test1.text == "foo bar bazinga" @@ -29,8 +30,12 @@ def test_text(): def test_normalization_check(): with pytest.raises(ValueError, match=r".*is not in NFC.*"): - ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ")) - assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ")) + ExtractedText("foo", None, None, + unicodedata.normalize("NFD", "Schlyñ"), + grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ"))) + assert ExtractedText("foo", None, None, + unicodedata.normalize("NFC", "Schlyñ"), + grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ"))) AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id") @@ -47,25 +52,27 @@ def test_align(): test1 = ExtractedText( None, [ - ExtractedText("s0", None, None, "foo"), - ExtractedText("s1", None, None, "bar"), - ExtractedText("s2", None, None, "batzinga"), + ExtractedText("s0", None, None, "foo", grapheme_clusters("foo")), + ExtractedText("s1", None, None, "bar", grapheme_clusters("bar")), + ExtractedText("s2", None, None, "batzinga", grapheme_clusters("batzinga")), ], " ", None, + None, ) test2 = ExtractedText( None, [ - ExtractedText("x0", None, None, "foo"), - ExtractedText("x1", None, None, "bar"), + ExtractedText("x0", None, None, "foo", grapheme_clusters("foo")), + ExtractedText("x1", None, None, "bar", grapheme_clusters("bar")), # extra . - ExtractedText("x2", None, None, "."), + ExtractedText("x2", None, None, ".", grapheme_clusters(".")), # deletion + different grapheme cluster, m̃ also is two Python characters - ExtractedText("x3", None, None, "bazim̃ga"), + ExtractedText("x3", None, None, "bazim̃ga", grapheme_clusters("bazim̃ga")), ], " ", None, + None, ) left_pos = 0