mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 11:50:00 +02:00
⚡️ dinglehopper: Improve performance by caching the Levenshtein matrix
Motivated by [a pull request](https://github.com/qurator-spk/dinglehopper/pull/7) by @JKamlah, implement a cache of the Levenshtein matrix calculation. Previously, we calculated the Levenshtein matrices for characters and words twice: once for the error rates and once for the alignment.
This commit is contained in:
parent
11a6341641
commit
58ff140bc0
2 changed files with 18 additions and 3 deletions
|
@ -1,21 +1,33 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import unicodedata
|
||||
from functools import partial
|
||||
from functools import partial, lru_cache
|
||||
from typing import Sequence, Tuple
|
||||
|
||||
import numpy as np
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
|
||||
def levenshtein_matrix(seq1, seq2):
|
||||
def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
|
||||
"""Compute the matrix commonly computed to produce the Levenshtein distance.
|
||||
|
||||
This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
|
||||
edit distance.
|
||||
|
||||
This algorithm is implemented here because we need an implementation that can work with sequences other than
|
||||
strings, e.g. lists of grapheme clusters or lists of word strings.
|
||||
"""
|
||||
|
||||
# Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
|
||||
# sequences to tuples to make them hashable.
|
||||
return _levenshtein_matrix(tuple(seq1), tuple(seq2))
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
|
||||
"""Compute the matrix commonly computed to produce the Levenshtein distance.
|
||||
|
||||
This is an LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
|
||||
"""
|
||||
m = len(seq1)
|
||||
n = len(seq2)
|
||||
|
||||
|
|
|
@ -96,6 +96,9 @@ def test_lines_similar():
|
|||
def __repr__(self):
|
||||
return 'SimilarString(\'%s\')' % self._string
|
||||
|
||||
def __hash__(self):
|
||||
return hash(self._string)
|
||||
|
||||
result = list(seq_align(
|
||||
[SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')],
|
||||
[SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue