|
|
|
@ -12,15 +12,16 @@ from .normalize import chars_normalized
|
|
|
|
|
|
|
|
|
|
def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
    """Compute the matrix commonly computed to produce the Levenshtein distance.

    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom
    right contains the desired edit distance.

    This algorithm is implemented here because we need an implementation that can work
    with sequences other than strings, e.g. lists of grapheme clusters or lists of word
    strings.

    :param seq1: first sequence to compare; it is converted to a tuple before use
    :param seq2: second sequence to compare; it is converted to a tuple before use
    :return: the matrix produced by the cached helper ``_levenshtein_matrix()`` for
        the two tuples
    """
    # Internally, we use a cached version. As the cache only works on hashable
    # parameters, we convert the input sequences to tuples to make them hashable.
    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -28,7 +29,8 @@ def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
|
|
|
|
|
def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
|
|
|
|
|
"""Compute the matrix commonly computed to produce the Levenshtein distance.
|
|
|
|
|
|
|
|
|
|
This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
|
|
|
|
|
This is a LRU cached function not meant to be used directly.
|
|
|
|
|
Use levenshtein_matrix() instead.
|
|
|
|
|
"""
|
|
|
|
|
m = len(seq1)
|
|
|
|
|
n = len(seq2)
|
|
|
|
@ -36,7 +38,7 @@ def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
|
|
|
|
|
def from_to(start, stop):
|
|
|
|
|
return range(start, stop + 1, 1)
|
|
|
|
|
|
|
|
|
|
D = np.zeros((m + 1, n + 1), np.int)
|
|
|
|
|
D = np.zeros((m + 1, n + 1), int)
|
|
|
|
|
D[0, 0] = 0
|
|
|
|
|
for i in from_to(1, m):
|
|
|
|
|
D[i, 0] = i
|
|
|
|
@ -75,8 +77,9 @@ def levenshtein_matrix_cache_clear():
|
|
|
|
|
def distance(s1: str, s2: str):
|
|
|
|
|
"""Compute the Levenshtein edit distance between two Unicode strings
|
|
|
|
|
|
|
|
|
|
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
|
|
|
|
|
clusters. This should be the correct way to compare two Unicode strings.
|
|
|
|
|
Note that this is different from levenshtein() as this function knows about Unicode
|
|
|
|
|
normalization and grapheme clusters. This should be the correct way to compare two
|
|
|
|
|
Unicode strings.
|
|
|
|
|
"""
|
|
|
|
|
seq1 = chars_normalized(s1)
|
|
|
|
|
seq2 = chars_normalized(s2)
|
|
|
|
@ -87,8 +90,8 @@ def seq_editops(seq1, seq2):
|
|
|
|
|
"""
|
|
|
|
|
Return sequence of edit operations transforming one sequence to another.
|
|
|
|
|
|
|
|
|
|
This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary
|
|
|
|
|
sequences.
|
|
|
|
|
This aims to return the same/similar results as python-Levenshtein's editops(),
|
|
|
|
|
just generalized to arbitrary sequences.
|
|
|
|
|
"""
|
|
|
|
|
seq1 = list(seq1)
|
|
|
|
|
seq2 = list(seq2)
|
|
|
|
|