Switch between the C and our own implementation for distance and editops.
parent
11916c2dcf
commit
0e263cfac2
@ -1,63 +1,86 @@
|
|||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
from .. import seq_editops, editops
|
import pytest
|
||||||
|
|
||||||
|
from .. import editops, editops_fast
|
||||||
def test_trivial():
|
|
||||||
assert seq_editops("abc", "abc") == []
|
TEST_PARAMS = "s1,s2,expected_ops"
|
||||||
assert seq_editops("", "") == []
|
|
||||||
|
TEST_STRINGS = [
|
||||||
|
# trivial
|
||||||
|
("abc", "abc", []),
|
||||||
|
("", "", []),
|
||||||
|
# insert
|
||||||
|
("bc", "abc", [("insert", 0, 0)]),
|
||||||
|
("ac", "abc", [("insert", 1, 1)]),
|
||||||
|
("ab", "abc", [("insert", 2, 2)]),
|
||||||
|
("", "a", [("insert", 0, 0)]),
|
||||||
|
# delete
|
||||||
|
("abcdef", "cdef", [("delete", 0, 0), ("delete", 1, 0)]),
|
||||||
|
("Xabcdef", "Xcdef", [("delete", 1, 1), ("delete", 2, 1)]),
|
||||||
|
("abcdefg", "acdefX", [("delete", 1, 1), ("replace", 6, 5)]),
|
||||||
|
("abcde", "aabcd", [("insert", 1, 1), ("delete", 4, 5)]),
|
||||||
|
("Foo", "", [("delete", 0, 0), ("delete", 1, 0), ("delete", 2, 0)]),
|
||||||
|
(
|
||||||
|
"Foolish",
|
||||||
|
"Foo",
|
||||||
|
[("delete", 3, 3), ("delete", 4, 3), ("delete", 5, 3), ("delete", 6, 3)],
|
||||||
|
),
|
||||||
|
# multiple
|
||||||
|
("bcd", "abce", [("insert", 0, 0), ("replace", 2, 3)]),
|
||||||
|
# ambiguous
|
||||||
|
("bcd", "abcef", [("insert", 0, 0), ("insert", 2, 3), ("replace", 2, 4)]),
|
||||||
|
]
|
||||||
|
|
||||||
|
TEST_SEQUENCES = [
|
||||||
|
(["a", "ab"], ["a", "ab", "c"], [("insert", 2, 2)]),
|
||||||
|
(["a", "ab"], ["a", "c"], [("replace", 1, 1)]),
|
||||||
|
]
|
||||||
|
|
||||||
|
TEST_UNICODE = [
|
||||||
|
# In these cases, one of the words has a composed form, the other one does not.
|
||||||
|
("Schlyñ", "Schlym̃", [("replace", 5, 5)]),
|
||||||
|
("oͤde", "öde", [("replace", 0, 0)]),
|
||||||
|
# equal
|
||||||
|
(
|
||||||
|
unicodedata.lookup("LATIN SMALL LETTER N")
|
||||||
|
+ unicodedata.lookup("COMBINING TILDE"),
|
||||||
|
unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE"),
|
||||||
|
[],
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_insert():
|
@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
|
||||||
assert seq_editops("bc", "abc") == [("insert", 0, 0)]
|
def test_editops_strings(s1, s2, expected_ops):
|
||||||
assert seq_editops("ac", "abc") == [("insert", 1, 1)]
|
ops = editops(s1, s2)
|
||||||
assert seq_editops("ab", "abc") == [("insert", 2, 2)]
|
assert ops == expected_ops
|
||||||
assert seq_editops("", "a") == [("insert", 0, 0)]
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiple():
|
@pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
|
||||||
assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
|
def test_editops_sequences(s1, s2, expected_ops):
|
||||||
|
ops = editops(s1, s2)
|
||||||
|
assert ops == expected_ops
|
||||||
|
|
||||||
|
|
||||||
def test_delete():
|
@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
|
||||||
assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
|
def test_editops_fast(s1, s2, expected_ops):
|
||||||
assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
|
ops = editops_fast(s1, s2)
|
||||||
assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
|
assert ops == expected_ops
|
||||||
assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
|
|
||||||
assert seq_editops("Foo", "") == [
|
|
||||||
("delete", 0, 0),
|
|
||||||
("delete", 1, 0),
|
|
||||||
("delete", 2, 0),
|
|
||||||
]
|
|
||||||
assert seq_editops("Foolish", "Foo") == [
|
|
||||||
("delete", 3, 3),
|
|
||||||
("delete", 4, 3),
|
|
||||||
("delete", 5, 3),
|
|
||||||
("delete", 6, 3),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def test_ambiguous():
|
@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
|
||||||
assert seq_editops("bcd", "abcef") == [
|
def test_editops_fast_unicode(s1, s2, expected_ops):
|
||||||
("insert", 0, 0),
|
ops = editops_fast(s1, s2)
|
||||||
("replace", 2, 3),
|
assert ops != expected_ops
|
||||||
("insert", 3, 4),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def test_editops():
|
@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
|
||||||
|
def test_editops_unicode(s1, s2, expected_ops):
|
||||||
"""Test editops() in cases where dealing with grapheme clusters matters"""
|
"""Test editops() in cases where dealing with grapheme clusters matters"""
|
||||||
|
|
||||||
# In these cases, one of the words has a composed form, the other one does not.
|
if not expected_ops:
|
||||||
assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
|
assert s1 != s2
|
||||||
assert editops("oͤde", "öde") == [("replace", 0, 0)]
|
assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2)
|
||||||
|
ops = editops(s1, s2)
|
||||||
|
assert ops == expected_ops
|
||||||
def test_editops_canonically_equivalent():
|
|
||||||
left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
|
|
||||||
"COMBINING TILDE"
|
|
||||||
)
|
|
||||||
right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
|
|
||||||
assert left != right
|
|
||||||
assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
|
|
||||||
assert editops(left, right) == []
|
|
||||||
|
Loading…
Reference in New Issue