diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py index c7e7733..45c4835 100644 --- a/qurator/dinglehopper/align.py +++ b/qurator/dinglehopper/align.py @@ -1,4 +1,5 @@ from .edit_distance import * +from rapidfuzz.string_metric import levenshtein_editops def align(t1, t2): @@ -12,7 +13,7 @@ def seq_align(s1, s2): """Align general sequences.""" s1 = list(s1) s2 = list(s2) - ops = seq_editops(s1, s2) + ops = levenshtein_editops(s1, s2) i = 0 j = 0 diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index 0b9c8f4..7fa4ae1 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -8,79 +8,19 @@ import numpy as np from multimethod import multimethod from uniseg.graphemecluster import grapheme_clusters from tqdm import tqdm +from rapidfuzz.string_metric import levenshtein, levenshtein_editops from .extracted_text import ExtractedText from .config import Config -def levenshtein_matrix(seq1: Sequence, seq2: Sequence): - """Compute the matrix commonly computed to produce the Levenshtein distance. - This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired - edit distance. - - This algorithm is implemented here because we need an implementation that can work with sequences other than - strings, e.g. lists of grapheme clusters or lists of word strings. - """ - - # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input - # sequences to tuples to make them hashable. - return _levenshtein_matrix(tuple(seq1), tuple(seq2)) - - -@lru_cache(maxsize=10) -def _levenshtein_matrix(seq1: Tuple, seq2: Tuple): - """Compute the matrix commonly computed to produce the Levenshtein distance. - - This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead. - """ - m = len(seq1) - n = len(seq2) - - def from_to(start, stop): - return range(start, stop + 1, 1) - - D = np.zeros((m + 1, n + 1), np.int) - D[0, 0] = 0 - for i in from_to(1, m): - D[i, 0] = i - for j in from_to(1, n): - D[0, j] = j - for i in tqdm(from_to(1, m), disable=not Config.progress): - for j in from_to(1, n): - D[i, j] = min( - D[i - 1, j - 1] - + 1 * (seq1[i - 1] != seq2[j - 1]), # Same or Substitution - D[i, j - 1] + 1, # Insertion - D[i - 1, j] + 1, # Deletion - ) - - return D - - -def levenshtein(seq1, seq2): - """Compute the Levenshtein edit distance between two sequences""" - m = len(seq1) - n = len(seq2) - - D = levenshtein_matrix(seq1, seq2) - return D[m, n] - - -def levenshtein_matrix_cache_clear(): - """Clear internal Levenshtein matrix cache. - - You want to do this between different input file pairs to decrease memory - usage by not caching results from prior input files. - """ - _levenshtein_matrix.cache_clear() - - @multimethod def distance(s1: str, s2: str): """Compute the Levenshtein edit distance between two Unicode strings - Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme - clusters. This should be the correct way to compare two Unicode strings. + Note that this is different from levenshtein() as this function knows about Unicode + normalization and grapheme clusters. This should be the correct way to compare two + Unicode strings. """ seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1))) seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2))) @@ -92,47 +32,6 @@ def distance(s1: ExtractedText, s2: ExtractedText): return distance(s1.text, s2.text) -def seq_editops(seq1, seq2): - """ - Return sequence of edit operations transforming one sequence to another. - - This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary - sequences. - """ - seq1 = list(seq1) - seq2 = list(seq2) - m = len(seq1) - n = len(seq2) - D = levenshtein_matrix(seq1, seq2) - - def _tail_backtrace(i, j, accumulator): - if i > 0 and D[i - 1, j] + 1 == D[i, j]: - return partial( - _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator - ) - if j > 0 and D[i, j - 1] + 1 == D[i, j]: - return partial( - _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator - ) - if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]: - return partial( - _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator - ) - if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]: - return partial(_tail_backtrace, i - 1, j - 1, accumulator) # NOP - return accumulator - - def backtrace(i, j): - result = partial(_tail_backtrace, i, j, []) - while isinstance(result, partial): - result = result() - - return result - - b = backtrace(m, n) - return b - - def editops(word1, word2): """ Return sequence of edit operations transforming one string to another. @@ -141,4 +40,4 @@ def editops(word1, word2): """ word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1))) word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2))) - return seq_editops(word1, word2) + return levenshtein_editops(word1, word2) diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py index adfbbab..7c513e6 100644 --- a/qurator/dinglehopper/ocrd_cli.py +++ b/qurator/dinglehopper/ocrd_cli.py @@ -8,7 +8,6 @@ from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality from pkg_resources import resource_string from .cli import process as cli_process -from .edit_distance import levenshtein_matrix_cache_clear OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8")) @@ -74,8 +73,6 @@ class OcrdDinglehopperEvaluate(Processor): local_filename=report_prefix + report_suffix, ) - # Clear cache between files - levenshtein_matrix_cache_clear() if __name__ == "__main__": ocrd_dinglehopper() diff --git a/qurator/dinglehopper/tests/test_editops.py b/qurator/dinglehopper/tests/test_editops.py index 06afbfc..8275b29 100644 --- a/qurator/dinglehopper/tests/test_editops.py +++ b/qurator/dinglehopper/tests/test_editops.py @@ -1,35 +1,38 @@ import unicodedata -from .. import seq_editops, editops +from rapidfuzz.string_metric import levenshtein_editops +from .. import editops +# TODO: Remove rapidfuzz.string_metric.levenshtein_editops tests eventually + def test_trivial(): - assert seq_editops("abc", "abc") == [] - assert seq_editops("", "") == [] + assert levenshtein_editops("abc", "abc") == [] + assert levenshtein_editops("", "") == [] def test_insert(): - assert seq_editops("bc", "abc") == [("insert", 0, 0)] - assert seq_editops("ac", "abc") == [("insert", 1, 1)] - assert seq_editops("ab", "abc") == [("insert", 2, 2)] - assert seq_editops("", "a") == [("insert", 0, 0)] + assert levenshtein_editops("bc", "abc") == [("insert", 0, 0)] + assert levenshtein_editops("ac", "abc") == [("insert", 1, 1)] + assert levenshtein_editops("ab", "abc") == [("insert", 2, 2)] + assert levenshtein_editops("", "a") == [("insert", 0, 0)] def test_multiple(): - assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)] + assert levenshtein_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)] def test_delete(): - assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)] - assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)] - assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)] - assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)] - assert seq_editops("Foo", "") == [ + assert levenshtein_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)] + assert levenshtein_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)] + assert levenshtein_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)] + assert levenshtein_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)] + assert levenshtein_editops("Foo", "") == [ ("delete", 0, 0), ("delete", 1, 0), ("delete", 2, 0), ] - assert seq_editops("Foolish", "Foo") == [ + assert levenshtein_editops("Foolish", "Foo") == [ ("delete", 3, 3), ("delete", 4, 3), ("delete", 5, 3), @@ -38,10 +41,10 @@ def test_delete(): def test_ambiguous(): - assert seq_editops("bcd", "abcef") == [ + assert levenshtein_editops("bcd", "abcef") == [ ("insert", 0, 0), - ("replace", 2, 3), - ("insert", 3, 4), + ("insert", 2, 3), + ("replace", 2, 4), ] diff --git a/requirements.txt b/requirements.txt index 7bb53ac..02bc99f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ ocrd >= 2.20.1 attrs multimethod == 1.3 # latest version to officially support Python 3.5 tqdm +rapidfuzz >= 1.8.1