Switch between C and own implementation for distance and editops.
parent 11916c2dcf
commit 0e263cfac2
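The commit title refers to switching between the C implementation and the package's own pure-Python code for distance and editops; the test diff below exercises both paths. The sketch here is only an illustration of that pattern, not this project's API: the Levenshtein module, the fast= flag, and the helper names are assumptions.

# Illustrative only: dispatch between a C-backed implementation and a
# pure-Python fallback. Module and helper names are assumptions for this sketch.
try:
    from Levenshtein import distance as _c_distance
    HAVE_C = True
except ImportError:  # C extension missing: fall back to pure Python
    HAVE_C = False


def _py_distance(s1, s2):
    """Plain Wagner-Fischer edit distance, used when the C extension is absent."""
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    previous = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, 1):
        current = [i]
        for j, c2 in enumerate(s2, 1):
            current.append(min(
                previous[j] + 1,               # delete c1
                current[j - 1] + 1,            # insert c2
                previous[j - 1] + (c1 != c2),  # keep or replace
            ))
        previous = current
    return previous[-1]


def distance(s1, s2, fast=True):
    """Prefer the C implementation when it is installed and requested."""
    if fast and HAVE_C:
        return _c_distance(s1, s2)
    return _py_distance(s1, s2)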
@@ -1,63 +1,86 @@
import unicodedata

from .. import seq_editops, editops


def test_trivial():
    assert seq_editops("abc", "abc") == []
    assert seq_editops("", "") == []
import pytest

from .. import editops, editops_fast

TEST_PARAMS = "s1,s2,expected_ops"

TEST_STRINGS = [
    # trivial
    ("abc", "abc", []),
    ("", "", []),
    # insert
    ("bc", "abc", [("insert", 0, 0)]),
    ("ac", "abc", [("insert", 1, 1)]),
    ("ab", "abc", [("insert", 2, 2)]),
    ("", "a", [("insert", 0, 0)]),
    # delete
    ("abcdef", "cdef", [("delete", 0, 0), ("delete", 1, 0)]),
    ("Xabcdef", "Xcdef", [("delete", 1, 1), ("delete", 2, 1)]),
    ("abcdefg", "acdefX", [("delete", 1, 1), ("replace", 6, 5)]),
    ("abcde", "aabcd", [("insert", 1, 1), ("delete", 4, 5)]),
    ("Foo", "", [("delete", 0, 0), ("delete", 1, 0), ("delete", 2, 0)]),
    (
        "Foolish",
        "Foo",
        [("delete", 3, 3), ("delete", 4, 3), ("delete", 5, 3), ("delete", 6, 3)],
    ),
    # multiple
    ("bcd", "abce", [("insert", 0, 0), ("replace", 2, 3)]),
    # ambiguous
    ("bcd", "abcef", [("insert", 0, 0), ("insert", 2, 3), ("replace", 2, 4)]),
]

TEST_SEQUENCES = [
    (["a", "ab"], ["a", "ab", "c"], [("insert", 2, 2)]),
    (["a", "ab"], ["a", "c"], [("replace", 1, 1)]),
]

TEST_UNICODE = [
    # In these cases, one of the words has a composed form, the other one does not.
    ("Schlyñ", "Schlym̃", [("replace", 5, 5)]),
    ("oͤde", "öde", [("replace", 0, 0)]),
    # equal
    (
        unicodedata.lookup("LATIN SMALL LETTER N")
        + unicodedata.lookup("COMBINING TILDE"),
        unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE"),
        [],
    ),
]


def test_insert():
    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
    assert seq_editops("", "a") == [("insert", 0, 0)]
@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
def test_editops_strings(s1, s2, expected_ops):
    ops = editops(s1, s2)
    assert ops == expected_ops


def test_multiple():
    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
@pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
def test_editops_sequences(s1, s2, expected_ops):
    ops = editops(s1, s2)
    assert ops == expected_ops


def test_delete():
    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
    assert seq_editops("Foo", "") == [
        ("delete", 0, 0),
        ("delete", 1, 0),
        ("delete", 2, 0),
    ]
    assert seq_editops("Foolish", "Foo") == [
        ("delete", 3, 3),
        ("delete", 4, 3),
        ("delete", 5, 3),
        ("delete", 6, 3),
    ]
@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
def test_editops_fast(s1, s2, expected_ops):
    ops = editops_fast(s1, s2)
    assert ops == expected_ops


def test_ambiguous():
    assert seq_editops("bcd", "abcef") == [
        ("insert", 0, 0),
        ("replace", 2, 3),
        ("insert", 3, 4),
    ]
@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
def test_editops_fast_unicode(s1, s2, expected_ops):
    ops = editops_fast(s1, s2)
    assert ops != expected_ops


def test_editops():
@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
def test_editops_unicode(s1, s2, expected_ops):
    """Test editops() in cases where dealing with grapheme clusters matters"""

    # In these cases, one of the words has a composed form, the other one does not.
    assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
    assert editops("oͤde", "öde") == [("replace", 0, 0)]


def test_editops_canonically_equivalent():
    left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
        "COMBINING TILDE"
    )
    right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
    assert left != right
    assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
    assert editops(left, right) == []
    if not expected_ops:
        assert s1 != s2
        assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2)
    ops = editops(s1, s2)
    assert ops == expected_ops
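The TEST_UNICODE cases turn on grapheme clusters: "oͤ" (o plus a combining mark) and the precomposed "ö" are each one user-perceived character, so the test data expects a single replace at index 0, while a plain code-point comparison sees sequences of different length (which is why test_editops_fast_unicode expects a different result). A rough sketch of the grouping idea, assuming it is enough to attach combining marks to the preceding base character; full segmentation is specified by UAX #29 and is not what this snippet implements.

import unicodedata


def grapheme_groups(s):
    """Attach each combining mark to the base character before it.

    Only a rough approximation of grapheme clusters, sufficient for the
    composed/decomposed pairs in TEST_UNICODE; real segmentation follows
    UAX #29.
    """
    groups = []
    for ch in s:
        if groups and unicodedata.combining(ch):
            groups[-1] += ch   # combining mark joins the previous group
        else:
            groups.append(ch)  # new base character starts a new group
    return groups


# Both spellings come out as three groups, so turning one into the other
# takes a single "replace" at index 0.
assert grapheme_groups("oͤde") == ["oͤ", "d", "e"]
assert grapheme_groups("öde") == ["ö", "d", "e"]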