⚡ dinglehopper: Use rapidfuzz for editops

2025-08-17 05:30:04 +02:00 · 2021-10-22 15:38:59 +02:00 · 2021-10-22 15:38:59 +02:00 · af8da1d716
commit af8da1d716
parent 249787686f
5 changed files with 28 additions and 127 deletions
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -1,4 +1,5 @@
 from .edit_distance import *
+from rapidfuzz.string_metric import levenshtein_editops


 def align(t1, t2):
@@ -12,7 +13,7 @@ def seq_align(s1, s2):
    """Align general sequences."""
    s1 = list(s1)
    s2 = list(s2)
-    ops = seq_editops(s1, s2)
+    ops = levenshtein_editops(s1, s2)
    i = 0
    j = 0

--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -8,79 +8,19 @@ import numpy as np
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 from tqdm import tqdm
+from rapidfuzz.string_metric import levenshtein, levenshtein_editops

 from .extracted_text import ExtractedText
 from .config import Config


-def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
-    edit distance.
-
-    This algorithm is implemented here because we need an implementation that can work with sequences other than
-    strings, e.g. lists of grapheme clusters or lists of word strings.
-    """
-
-    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
-    # sequences to tuples to make them hashable.
-    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
-
-
-@lru_cache(maxsize=10)
-def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-
-    This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
-    """
-    m = len(seq1)
-    n = len(seq2)
-
-    def from_to(start, stop):
-        return range(start, stop + 1, 1)
-
-    D = np.zeros((m + 1, n + 1), np.int)
-    D[0, 0] = 0
-    for i in from_to(1, m):
-        D[i, 0] = i
-    for j in from_to(1, n):
-        D[0, j] = j
-    for i in tqdm(from_to(1, m), disable=not Config.progress):
-        for j in from_to(1, n):
-            D[i, j] = min(
-                D[i - 1, j - 1]
-                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
-                D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1,  # Deletion
-            )
-
-    return D
-
-
-def levenshtein(seq1, seq2):
-    """Compute the Levenshtein edit distance between two sequences"""
-    m = len(seq1)
-    n = len(seq2)
-
-    D = levenshtein_matrix(seq1, seq2)
-    return D[m, n]
-
-
-def levenshtein_matrix_cache_clear():
-    """Clear internal Levenshtein matrix cache.
-
-    You want to do this between different input file pairs to decrease memory
-    usage by not caching results from prior input files.
-    """
-    _levenshtein_matrix.cache_clear()
-
-
@multimethod
 def distance(s1: str, s2: str):
    """Compute the Levenshtein edit distance between two Unicode strings

-    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
-    clusters. This should be the correct way to compare two Unicode strings.
+    Note that this is different from levenshtein() as this function knows about Unicode
+    normalization and grapheme clusters. This should be the correct way to compare two
+    Unicode strings.
    """
    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
@@ -92,47 +32,6 @@ def distance(s1: ExtractedText, s2: ExtractedText):
    return distance(s1.text, s2.text)


-def seq_editops(seq1, seq2):
-    """
-    Return sequence of edit operations transforming one sequence to another.
-
-    This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary
-    sequences.
-    """
-    seq1 = list(seq1)
-    seq2 = list(seq2)
-    m = len(seq1)
-    n = len(seq2)
-    D = levenshtein_matrix(seq1, seq2)
-
-    def _tail_backtrace(i, j, accumulator):
-        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
-            )
-        if j > 0 and D[i, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
-        return accumulator
-
-    def backtrace(i, j):
-        result = partial(_tail_backtrace, i, j, [])
-        while isinstance(result, partial):
-            result = result()
-
-        return result
-
-    b = backtrace(m, n)
-    return b
-
-
 def editops(word1, word2):
    """
    Return sequence of edit operations transforming one string to another.
@@ -141,4 +40,4 @@ def editops(word1, word2):
    """
    word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
    word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
-    return seq_editops(word1, word2)
+    return levenshtein_editops(word1, word2)
--- a/qurator/dinglehopper/ocrd_cli.py
+++ b/qurator/dinglehopper/ocrd_cli.py
@@ -8,7 +8,6 @@ from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
 from pkg_resources import resource_string

 from .cli import process as cli_process
-from .edit_distance import levenshtein_matrix_cache_clear

 OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))

@@ -74,8 +73,6 @@ class OcrdDinglehopperEvaluate(Processor):
                    local_filename=report_prefix + report_suffix,
                )

-            # Clear cache between files
-            levenshtein_matrix_cache_clear()

 if __name__ == "__main__":
    ocrd_dinglehopper()
--- a/qurator/dinglehopper/tests/test_editops.py
+++ b/qurator/dinglehopper/tests/test_editops.py
@@ -1,35 +1,38 @@
 import unicodedata

-from .. import seq_editops, editops
+from rapidfuzz.string_metric import levenshtein_editops
+from .. import editops


+# TODO: Remove rapidfuzz.string_metric.levenshtein_editops tests eventually
+
 def test_trivial():
-    assert seq_editops("abc", "abc") == []
-    assert seq_editops("", "") == []
+    assert levenshtein_editops("abc", "abc") == []
+    assert levenshtein_editops("", "") == []


 def test_insert():
-    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
-    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
-    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
-    assert seq_editops("", "a") == [("insert", 0, 0)]
+    assert levenshtein_editops("bc", "abc") == [("insert", 0, 0)]
+    assert levenshtein_editops("ac", "abc") == [("insert", 1, 1)]
+    assert levenshtein_editops("ab", "abc") == [("insert", 2, 2)]
+    assert levenshtein_editops("", "a") == [("insert", 0, 0)]


 def test_multiple():
-    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
+    assert levenshtein_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]


 def test_delete():
-    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
-    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
-    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
-    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
-    assert seq_editops("Foo", "") == [
+    assert levenshtein_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
+    assert levenshtein_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
+    assert levenshtein_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
+    assert levenshtein_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
+    assert levenshtein_editops("Foo", "") == [
        ("delete", 0, 0),
        ("delete", 1, 0),
        ("delete", 2, 0),
    ]
-    assert seq_editops("Foolish", "Foo") == [
+    assert levenshtein_editops("Foolish", "Foo") == [
        ("delete", 3, 3),
        ("delete", 4, 3),
        ("delete", 5, 3),
@@ -38,10 +41,10 @@ def test_delete():


 def test_ambiguous():
-    assert seq_editops("bcd", "abcef") == [
+    assert levenshtein_editops("bcd", "abcef") == [
        ("insert", 0, 0),
-        ("replace", 2, 3),
-        ("insert", 3, 4),
+        ("insert", 2, 3),
+        ("replace", 2, 4),
    ]


--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ ocrd >= 2.20.1
 attrs
 multimethod == 1.3  # latest version to officially support Python 3.5
 tqdm
+rapidfuzz >= 1.8.1