dinglehopper: Improve performance by caching the Levenshtein matrix

Motivated by [a pull
request](https://github.com/qurator-spk/dinglehopper/pull/7) by
@JKamlah, implement a cache of the Levenshtein matrix calculation.

We calculated the Levenshtein matrices for characters and words twice:
Once for the error rates, once for the alignment.
pull/29/head
Gerber, Mike 5 years ago
parent 11a6341641
commit 58ff140bc0

@ -1,21 +1,33 @@
from __future__ import division, print_function from __future__ import division, print_function
import unicodedata import unicodedata
from functools import partial from functools import partial, lru_cache
from typing import Sequence, Tuple
import numpy as np import numpy as np
from uniseg.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
def levenshtein_matrix(seq1, seq2): def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
"""Compute the matrix commonly computed to produce the Levenshtein distance. """Compute the matrix commonly computed to produce the Levenshtein distance.
This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
edit distance. edit distance.
This algorithm is implemented here because we need an implementation that can work with sequences other than This algorithm is implemented here because we need an implementation that can work with sequences other than
strings, e.g. lists of grapheme clusters or lists of word strings. strings, e.g. lists of grapheme clusters or lists of word strings.
""" """
# Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
# sequences to tuples to make them hashable.
return _levenshtein_matrix(tuple(seq1), tuple(seq2))
@lru_cache()
def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
"""Compute the matrix commonly computed to produce the Levenshtein distance.
This is an LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
"""
m = len(seq1) m = len(seq1)
n = len(seq2) n = len(seq2)

@ -96,6 +96,9 @@ def test_lines_similar():
def __repr__(self): def __repr__(self):
return 'SimilarString(\'%s\')' % self._string return 'SimilarString(\'%s\')' % self._string
def __hash__(self):
return hash(self._string)
result = list(seq_align( result = list(seq_align(
[SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')], [SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')],
[SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')] [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]

Loading…
Cancel
Save