Add a new Levenshtein matrix calculation.

pull/7/head
JKamlah 6 years ago
parent 29a2c8218f
commit 6ad003b015

@ -16,26 +16,36 @@ def levenshtein_matrix(seq1, seq2):
This algorithm is implemented here because we need an implementation that can work with sequences other than This algorithm is implemented here because we need an implementation that can work with sequences other than
strings, e.g. lists of grapheme clusters or lists of word strings. strings, e.g. lists of grapheme clusters or lists of word strings.
""" """
m = len(seq1) m = len(seq1)
n = len(seq2) n = len(seq2)
def from_to(start, stop): # Generate unique grapheme sets for both sequences
return range(start, stop + 1, 1) seq1set = set(seq1)
seq2set = set(seq2)
D = np.zeros((m + 1, n + 1), np.int)
D[0, 0] = 0 # All grapheme which occur in both sets
for i in from_to(1, m): interset = seq1set.intersection(seq2set)
D[i, 0] = i
for j in from_to(1, n): # Generate a boolean-mask for each interset grapheme
D[0, j] = j masks = {grapheme:[1]*(len(seq2)+1)for grapheme in interset}
for i in from_to(1, m):
for j in from_to(1, n): for idx, grapheme in enumerate(seq2):
D[i, j] = min( if grapheme in interset:
D[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution masks[grapheme][idx] = 0
D[i, j - 1] + 1, # Insertion
D[i - 1, j] + 1 # Deletion D = np.ones((m + 1, n + 1), np.int)
) D[:,0] = np.arange(m+1)
D[0,:] = np.arange(n+1)
for row, grapheme in enumerate(seq1):
if seq1[row] in interset:
mask = masks[grapheme]
for col in range(0,n):
D[row + 1, col + 1] = min(D[row, col] + mask[col], D[row + 1, col]+1, D[row, col + 1]+1)
else:
for col in range(0,n):
D[row+1,col+1] = min(D[row,col],D[row+1,col],D[row,col+1])+1
return D return D

Loading…
Cancel
Save