mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-01 22:50:08 +02:00
⚡️ dinglehopper: Improve performance by caching the Levenshtein matrix
Motivated by [a pull request](https://github.com/qurator-spk/dinglehopper/pull/7) by @JKamlah, implement a cache of the Levenshtein matrix calculation. We calculated the Levenshtein matrices for characters and words twice: once for the error rates, once for the alignment.
This commit is contained in:
parent
11a6341641
commit
58ff140bc0
2 changed files with 18 additions and 3 deletions
|
@ -1,21 +1,33 @@
|
||||||
from __future__ import division, print_function
|
from __future__ import division, print_function
|
||||||
|
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from functools import partial
|
from functools import partial, lru_cache
|
||||||
|
from typing import Sequence, Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from uniseg.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
|
|
||||||
def levenshtein_matrix(seq1, seq2):
|
def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
|
||||||
"""Compute the matrix commonly computed to produce the Levenshtein distance.
|
"""Compute the matrix commonly computed to produce the Levenshtein distance.
|
||||||
|
|
||||||
This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
|
This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
|
||||||
edit distance.
|
edit distance.
|
||||||
|
|
||||||
This algorithm is implemented here because we need an implementation that can work with sequences other than
|
This algorithm is implemented here because we need an implementation that can work with sequences other than
|
||||||
strings, e.g. lists of grapheme clusters or lists of word strings.
|
strings, e.g. lists of grapheme clusters or lists of word strings.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
|
||||||
|
# sequences to tuples to make them hashable.
|
||||||
|
return _levenshtein_matrix(tuple(seq1), tuple(seq2))
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
|
||||||
|
"""Compute the matrix commonly computed to produce the Levenshtein distance.
|
||||||
|
|
||||||
|
This is an LRU-cached function not meant to be used directly. Use levenshtein_matrix() instead.
|
||||||
|
"""
|
||||||
m = len(seq1)
|
m = len(seq1)
|
||||||
n = len(seq2)
|
n = len(seq2)
|
||||||
|
|
||||||
|
|
|
@ -96,6 +96,9 @@ def test_lines_similar():
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return 'SimilarString(\'%s\')' % self._string
|
return 'SimilarString(\'%s\')' % self._string
|
||||||
|
|
||||||
|
def __hash__(self):
|
||||||
|
return hash(self._string)
|
||||||
|
|
||||||
result = list(seq_align(
|
result = list(seq_align(
|
||||||
[SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')],
|
[SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')],
|
||||||
[SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
|
[SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue