Rearrange the new algorithm and set a limit for when to use it.

pull/7/head
JKamlah 6 years ago
parent 6ad003b015
commit 3e515933e6

@ -20,34 +20,41 @@ def levenshtein_matrix(seq1, seq2):
m = len(seq1) m = len(seq1)
n = len(seq2) n = len(seq2)
# Generate unique grapheme sets for both sequences D = np.ones((m + 1, n + 1), np.int)
seq1set = set(seq1) D[:,0] = np.arange(m+1)
seq2set = set(seq2) D[0,:] = np.arange(n+1)
if m > 10 and n > 10:
# All grapheme which occur in both sets # All grapheme which occur in both sets
interset = seq1set.intersection(seq2set) interset = set(seq1).intersection(set(seq2))
# Generate a boolean-mask for each interset grapheme # Generate a boolean-mask for each interset grapheme
masks = {grapheme:[1]*(len(seq2)+1)for grapheme in interset} masks = {grapheme: [0] * (len(seq2) + 1) for grapheme in interset}
for idx, grapheme in enumerate(seq2): for idx, grapheme in enumerate(seq2):
if grapheme in interset: if grapheme in interset:
masks[grapheme][idx] = 0 masks[grapheme][idx] = -1
D = np.ones((m + 1, n + 1), np.int)
D[:,0] = np.arange(m+1)
D[0,:] = np.arange(n+1)
# Calculate the Levenshtein matrix
for row, grapheme in enumerate(seq1): for row, grapheme in enumerate(seq1):
if seq1[row] in interset: if seq1[row] in interset:
mask = masks[grapheme] mask = masks[grapheme]
for col in range(0,n): for col in range(0,n):
D[row + 1, col + 1] = min(D[row, col] + mask[col], D[row + 1, col]+1, D[row, col + 1]+1) D[row + 1, col + 1] = min(D[row, col] + mask[col], D[row + 1, col], D[row, col + 1])+1
else: else:
for col in range(0,n): for col in range(0,n):
D[row+1,col+1] = min(D[row,col],D[row+1,col],D[row,col+1])+1 D[row+1,col+1] = min(D[row,col],D[row+1,col],D[row,col+1])+1
return D
else:
for i in range(1, m+1):
for j in range(1, n+1):
E[i, j] = min(
E[i - 1, j - 1] + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution
E[i, j - 1] + 1, # Insertion
E[i - 1, j] + 1 # Deletion
)
return D
def levenshtein(seq1, seq2): def levenshtein(seq1, seq2):
"""Compute the Levenshtein edit distance between two sequences""" """Compute the Levenshtein edit distance between two sequences"""

Loading…
Cancel
Save