Switch between C and own implementation for distance and editops.

pull/48/head
Benjamin Rosemann 4 years ago
parent 11916c2dcf
commit 0e263cfac2

@@ -12,7 +12,7 @@ def seq_align(s1, s2):
"""Align general sequences."""
s1 = list(s1)
s2 = list(s2)
ops = seq_editops(s1, s2)
ops = editops(s1, s2)
i = 0
j = 0

@@ -2,9 +2,11 @@ from __future__ import division, print_function
import unicodedata
from functools import partial, lru_cache
from itertools import chain
from typing import Sequence, Tuple, List
import numpy as np
from Levenshtein import editops as c_editops, distance as c_distance
from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters
from tqdm import tqdm
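
The `multimethod` import above is what enables the overload-style definitions in this commit: several functions share one name and the implementation is selected from the annotated argument types. A minimal, hypothetical illustration of the mechanism (the `describe` function is made up here and is not part of the module):

    from typing import List

    from multimethod import multimethod

    @multimethod
    def describe(x: str):
        # chosen when called with a string
        return "string"

    @multimethod
    def describe(x: List):
        # chosen when called with a list
        return "list"

    assert describe("abc") == "string"
    assert describe(["a", "ab"]) == "list"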
@@ -79,12 +81,17 @@ def levenshtein_matrix_cache_clear():
def distance(s1: str, s2: str):
"""Compute the Levenshtein edit distance between two Unicode strings
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
clusters. This should be the correct way to compare two Unicode strings.
Note that this is different from levenshtein() as this function knows about Unicode
normalization and grapheme clusters.
This should be the correct way to compare two Unicode strings.
"""
seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
if any(len(s) > 1 for s in chain(seq1, seq2)):
return distance(seq1, seq2)
else:
return distance_fast("".join(seq1), "".join(seq2))
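
A short usage sketch of the string overload (the expected value matches the Unicode test cases added below): because comparison happens per grapheme cluster, replacing the final cluster counts as a single edit no matter how many code points it spans.

    # "Schlyñ" ends in LATIN SMALL LETTER N WITH TILDE (one cluster, one code point);
    # "Schlym̃" ends in m + COMBINING TILDE (one cluster, two code points).
    assert distance("Schlyñ", "Schlym̃") == 1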
@multimethod
@@ -97,12 +104,24 @@ def distance(s1: List, s2: List):
return levenshtein(s1, s2)
def seq_editops(seq1, seq2):
def distance_fast(s1: str, s2: str):
"""Compute the Levenshtein edit distance between two Unicode strings
Also see `distance()`.
The difference is that this implementation does not care about grapheme clusters or
Unicode normalization, assuming that this has already been done in preprocessing.
"""
return c_distance(s1, s2)
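
The contrast with the grapheme-cluster-aware `distance()` is what the Unicode tests below rely on; roughly (assuming both functions are in scope, and without pinning down the exact `distance_fast` value):

    import unicodedata

    composed = unicodedata.normalize("NFC", "Schlyñ")    # 6 code points
    decomposed = unicodedata.normalize("NFD", "Schlyñ")  # 7 code points (n + COMBINING TILDE)

    assert distance(composed, decomposed) == 0      # grapheme-aware, normalizing path
    assert distance_fast(composed, decomposed) > 0  # raw code-point comparison differs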
@multimethod
def editops(seq1: List, seq2: List):
"""
Return sequence of edit operations transforming one sequence to another.
This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary
sequences.
This aims to return the same/similar results as python-Levenshtein's editops(),
just generalized to arbitrary sequences.
"""
seq1 = list(seq1)
seq2 = list(seq2)
@@ -138,12 +157,27 @@ def seq_editops(seq1, seq2):
return b
def editops(word1, word2):
@multimethod
def editops(s1: str, s2: str):
"""
Return sequence of edit operations transforming one string to another.
Note that this returns indices to the _grapheme clusters_, not characters!
"""
word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
return seq_editops(word1, word2)
s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
if any(len(s) > 1 for s in chain(s1, s2)):
return editops(s1, s2)
else:
return editops_fast("".join(s1), "".join(s2))
def editops_fast(s1: str, s2: str):
"""Return sequence of edit operations transforming one string to another.
Also see `editops()`.
The difference is that this implementation does not care about grapheme clusters or
Unicode normalization, assuming that this has already been done in preprocessing.
"""
return c_editops(s1, s2)
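
The same split shows up for edit operations; a sketch based on the TEST_UNICODE cases added below (the exact `editops_fast` result depends on the raw code points, so only inequality is asserted):

    # Grapheme-aware: "oͤ" vs "ö" is one replaced cluster at index 0.
    assert editops("oͤde", "öde") == [("replace", 0, 0)]

    # Code-point based: indices refer to raw code points, so the result
    # differs from the grapheme-cluster view.
    assert editops_fast("oͤde", "öde") != [("replace", 0, 0)]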

@@ -1,13 +1,11 @@
from __future__ import division, print_function
import unicodedata
import pytest
from .. import levenshtein, distance
from .. import distance, distance_fast
TEST_PARAMS = "seq1,seq2,expected_dist"
TEST_PARAMS = "s1,s2,expected_dist"
TEST_STRINGS = [
("a", "a", 0),
@@ -24,30 +22,45 @@ TEST_STRINGS = [
TEST_SEQUENCES = [(["a", "ab"], ["a", "ab", "c"], 1), (["a", "ab"], ["a", "c"], 1)]
TEST_UNICODE = [
# Different, decomposed!
(unicodedata.normalize("NFC", "Schlyñ"), unicodedata.normalize("NFD", "Schlyñ"), 0),
# Same decomposition
(
# This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
"Schlyñ",
# This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
"Schlym̃",
1,
),
]
@pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
def test_distance_sequences(seq1, seq2, expected_dist):
dist = distance(seq1, seq2)
def test_distance_sequences(s1, s2, expected_dist):
dist = distance(s1, s2)
assert dist == expected_dist
@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
def test_distance_strings(s1, s2, expected_dist):
dist = distance(s1, s2)
assert dist == expected_dist
@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
def test_distance(seq1, seq2, expected_dist):
dist = distance(seq1, seq2)
def test_distance_fast(s1, s2, expected_dist):
dist = distance_fast(s1, s2)
assert dist == expected_dist
def test_distance_unicode_wide():
word1 = unicodedata.normalize("NFC", "Schlyñ")
word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed!
assert distance(word1, word2) == 0
word1 = "Schlyñ"
assert (
len(word1) == 6
) # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
word2 = "Schlym̃"
assert (
len(word2) == 7
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
assert distance(word1, word2) == 1
@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
def test_distance_fast_unicode(s1, s2, expected_dist):
dist = distance_fast(s1, s2)
assert dist != expected_dist
@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
def test_distance_unicode(s1, s2, expected_dist):
dist = distance(s1, s2)
assert dist == expected_dist

@@ -1,63 +1,86 @@
import unicodedata
from .. import seq_editops, editops
import pytest
from .. import editops, editops_fast
TEST_PARAMS = "s1,s2,expected_ops"
TEST_STRINGS = [
# trivial
("abc", "abc", []),
("", "", []),
# insert
("bc", "abc", [("insert", 0, 0)]),
("ac", "abc", [("insert", 1, 1)]),
("ab", "abc", [("insert", 2, 2)]),
("", "a", [("insert", 0, 0)]),
# delete
("abcdef", "cdef", [("delete", 0, 0), ("delete", 1, 0)]),
("Xabcdef", "Xcdef", [("delete", 1, 1), ("delete", 2, 1)]),
("abcdefg", "acdefX", [("delete", 1, 1), ("replace", 6, 5)]),
("abcde", "aabcd", [("insert", 1, 1), ("delete", 4, 5)]),
("Foo", "", [("delete", 0, 0), ("delete", 1, 0), ("delete", 2, 0)]),
(
"Foolish",
"Foo",
[("delete", 3, 3), ("delete", 4, 3), ("delete", 5, 3), ("delete", 6, 3)],
),
# multiple
("bcd", "abce", [("insert", 0, 0), ("replace", 2, 3)]),
# ambiguous
("bcd", "abcef", [("insert", 0, 0), ("insert", 2, 3), ("replace", 2, 4)]),
]
TEST_SEQUENCES = [
(["a", "ab"], ["a", "ab", "c"], [("insert", 2, 2)]),
(["a", "ab"], ["a", "c"], [("replace", 1, 1)]),
]
def test_trivial():
assert seq_editops("abc", "abc") == []
assert seq_editops("", "") == []
TEST_UNICODE = [
# In these cases, one of the words has a composed form, the other one does not.
("Schlyñ", "Schlym̃", [("replace", 5, 5)]),
("oͤde", "öde", [("replace", 0, 0)]),
# equal
(
unicodedata.lookup("LATIN SMALL LETTER N")
+ unicodedata.lookup("COMBINING TILDE"),
unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE"),
[],
),
]
def test_insert():
assert seq_editops("bc", "abc") == [("insert", 0, 0)]
assert seq_editops("ac", "abc") == [("insert", 1, 1)]
assert seq_editops("ab", "abc") == [("insert", 2, 2)]
assert seq_editops("", "a") == [("insert", 0, 0)]
@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
def test_editops_strings(s1, s2, expected_ops):
ops = editops(s1, s2)
assert ops == expected_ops
def test_multiple():
assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
@pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
def test_editops_sequences(s1, s2, expected_ops):
ops = editops(s1, s2)
assert ops == expected_ops
def test_delete():
assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
assert seq_editops("Foo", "") == [
("delete", 0, 0),
("delete", 1, 0),
("delete", 2, 0),
]
assert seq_editops("Foolish", "Foo") == [
("delete", 3, 3),
("delete", 4, 3),
("delete", 5, 3),
("delete", 6, 3),
]
@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
def test_editops_fast(s1, s2, expected_ops):
ops = editops_fast(s1, s2)
assert ops == expected_ops
def test_ambiguous():
assert seq_editops("bcd", "abcef") == [
("insert", 0, 0),
("replace", 2, 3),
("insert", 3, 4),
]
@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
def test_editops_fast_unicode(s1, s2, expected_ops):
ops = editops_fast(s1, s2)
assert ops != expected_ops
def test_editops():
@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
def test_editops_unicode(s1, s2, expected_ops):
"""Test editops() in cases where dealing with grapheme clusters matters"""
# In these cases, one of the words has a composed form, the other one does not.
assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
assert editops("oͤde", "öde") == [("replace", 0, 0)]
def test_editops_canonically_equivalent():
left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
"COMBINING TILDE"
)
right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
assert left != right
assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
assert editops(left, right) == []
if not expected_ops:
assert s1 != s2
assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2)
ops = editops(s1, s2)
assert ops == expected_ops

@@ -6,7 +6,7 @@ from multimethod import multimethod
import uniseg.wordbreak
from .edit_distance import levenshtein
from .edit_distance import distance
from . import ExtractedText
@@ -81,7 +81,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, i
reference_seq = list(reference)
compared_seq = list(compared)
d = levenshtein(reference_seq, compared_seq)
d = distance(reference_seq, compared_seq)
n = len(reference_seq)
if d == 0:
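
For orientation, a simplified standalone sketch of the quantity computed here (a hypothetical helper, not the module's actual `word_error_rate_n`; the handling of an empty reference is an assumption): the word error rate is the edit distance between the word sequences divided by the number of reference words.

    def word_error_rate_sketch(reference, compared):
        reference_seq = list(reference)
        compared_seq = list(compared)
        d = distance(reference_seq, compared_seq)  # generalized Levenshtein on word lists
        n = len(reference_seq)
        if n == 0:
            # assumption: empty reference gives 0 if nothing was hypothesized, else infinity
            return 0.0 if d == 0 else float("inf")
        return d / n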
