From 58ff140bc013702901eca037b1358dc574dc88e1 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Mon, 18 Nov 2019 15:33:17 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20dinglehopper:=20Improve=20?= =?UTF-8?q?performance=20by=20caching=20the=20Levenshtein=20matrix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Motivated by [a pull request](https://github.com/qurator-spk/dinglehopper/pull/7) by @JKamlah, implement a cache of the Levenshtein matrix calculation. We calculated the Levenshtein matrices for characters and words twice: Once for the error rates, once for the alignment. --- qurator/dinglehopper/edit_distance.py | 18 +++++++++++++++--- qurator/dinglehopper/tests/test_align.py | 3 +++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index 7322563..fec2bca 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -1,21 +1,33 @@ from __future__ import division, print_function import unicodedata -from functools import partial +from functools import partial, lru_cache +from typing import Sequence, Tuple import numpy as np from uniseg.graphemecluster import grapheme_clusters -def levenshtein_matrix(seq1, seq2): +def levenshtein_matrix(seq1: Sequence, seq2: Sequence): """Compute the matrix commonly computed to produce the Levenshtein distance. - This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired edit distance. This algorithm is implemented here because we need an implementation that can work with sequences other than strings, e.g. lists of grapheme clusters or lists of word strings. """ + + # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input + # sequences to tuples to make them hashable. 
+ return _levenshtein_matrix(tuple(seq1), tuple(seq2)) + + +@lru_cache() +def _levenshtein_matrix(seq1: Tuple, seq2: Tuple): + """Compute the matrix commonly computed to produce the Levenshtein distance. + + This is an LRU-cached function not meant to be used directly. Use levenshtein_matrix() instead. + """ m = len(seq1) n = len(seq2) diff --git a/qurator/dinglehopper/tests/test_align.py b/qurator/dinglehopper/tests/test_align.py index 339a890..cc5cb43 100644 --- a/qurator/dinglehopper/tests/test_align.py +++ b/qurator/dinglehopper/tests/test_align.py @@ -96,6 +96,9 @@ def test_lines_similar(): def __repr__(self): return 'SimilarString(\'%s\')' % self._string + def __hash__(self): + return hash(self._string) + result = list(seq_align( [SimilarString('This is a line.'), SimilarString('This is another'), SimilarString('And the last line')], [SimilarString('This is a ljne.'), SimilarString('This is another'), SimilarString('J u n k'), SimilarString('And the last line')]