⚡ dinglehopper: Use rapidfuzz for editops

2025-07-05 16:39:59 +02:00 · 2021-10-22 15:38:59 +02:00 · 2021-10-22 15:38:59 +02:00 · af8da1d716
commit af8da1d716
parent 249787686f
5 changed files with 28 additions and 127 deletions
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@ -1,4 +1,5 @@
 from .edit_distance import *
 from rapidfuzz.string_metric import levenshtein_editops
 def align(t1, t2):
@ -12,7 +13,7 @@ def seq_align(s1, s2):
    """Align general sequences."""
    s1 = list(s1)
    s2 = list(s2)
-    ops = seq_editops(s1, s2)
+    ops = levenshtein_editops(s1, s2)
    i = 0
    j = 0
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@ -8,79 +8,19 @@ import numpy as np
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 from tqdm import tqdm
 from rapidfuzz.string_metric import levenshtein, levenshtein_editops
 from .extracted_text import ExtractedText
 from .config import Config
 def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
    """Compute the matrix commonly computed to produce the Levenshtein distance.
    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
    edit distance.
    This algorithm is implemented here because we need an implementation that can work with sequences other than
    strings, e.g. lists of grapheme clusters or lists of word strings.
    """
    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
    # sequences to tuples to make them hashable.
    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
@lru_cache(maxsize=10)
 def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
    """Compute the matrix commonly computed to produce the Levenshtein distance.
    This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
    """
    m = len(seq1)
    n = len(seq2)
    def from_to(start, stop):
        return range(start, stop + 1, 1)
    D = np.zeros((m + 1, n + 1), np.int)
    D[0, 0] = 0
    for i in from_to(1, m):
        D[i, 0] = i
    for j in from_to(1, n):
        D[0, j] = j
    for i in tqdm(from_to(1, m), disable=not Config.progress):
        for j in from_to(1, n):
            D[i, j] = min(
                D[i - 1, j - 1]
                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
                D[i, j - 1] + 1,  # Insertion
                D[i - 1, j] + 1,  # Deletion
            )
    return D
 def levenshtein(seq1, seq2):
    """Compute the Levenshtein edit distance between two sequences"""
    m = len(seq1)
    n = len(seq2)
    D = levenshtein_matrix(seq1, seq2)
    return D[m, n]
 def levenshtein_matrix_cache_clear():
    """Clear internal Levenshtein matrix cache.
    You want to do this between different input file pairs to decrease memory
    usage by not caching results from prior input files.
    """
    _levenshtein_matrix.cache_clear()
@multimethod
 def distance(s1: str, s2: str):
    """Compute the Levenshtein edit distance between two Unicode strings
-    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
+    Note that this is different from levenshtein() as this function knows about Unicode
-    clusters. This should be the correct way to compare two Unicode strings.
+    normalization and grapheme clusters. This should be the correct way to compare two
    Unicode strings.
    """
    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
@ -92,47 +32,6 @@ def distance(s1: ExtractedText, s2: ExtractedText):
    return distance(s1.text, s2.text)
 def seq_editops(seq1, seq2):
    """
    Return sequence of edit operations transforming one sequence to another.
    This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary
    sequences.
    """
    seq1 = list(seq1)
    seq2 = list(seq2)
    m = len(seq1)
    n = len(seq2)
    D = levenshtein_matrix(seq1, seq2)
    def _tail_backtrace(i, j, accumulator):
        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
            return partial(
                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
            )
        if j > 0 and D[i, j - 1] + 1 == D[i, j]:
            return partial(
                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
            )
        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
            return partial(
                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
            )
        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
        return accumulator
    def backtrace(i, j):
        result = partial(_tail_backtrace, i, j, [])
        while isinstance(result, partial):
            result = result()
        return result
    b = backtrace(m, n)
    return b
 def editops(word1, word2):
    """
    Return sequence of edit operations transforming one string to another.
@ -141,4 +40,4 @@ def editops(word1, word2):
    """
    word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
    word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
-    return seq_editops(word1, word2)
+    return levenshtein_editops(word1, word2)
--- a/qurator/dinglehopper/ocrd_cli.py
+++ b/qurator/dinglehopper/ocrd_cli.py
@ -8,7 +8,6 @@ from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
 from pkg_resources import resource_string
 from .cli import process as cli_process
 from .edit_distance import levenshtein_matrix_cache_clear
 OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
@ -74,8 +73,6 @@ class OcrdDinglehopperEvaluate(Processor):
                    local_filename=report_prefix + report_suffix,
                )
            # Clear cache between files
            levenshtein_matrix_cache_clear()
 if __name__ == "__main__":
    ocrd_dinglehopper()
--- a/qurator/dinglehopper/tests/test_editops.py
+++ b/qurator/dinglehopper/tests/test_editops.py
@ -1,35 +1,38 @@
 import unicodedata
-from .. import seq_editops, editops
+from rapidfuzz.string_metric import levenshtein_editops
 from .. import editops
 # TODO: Remove rapidfuzz.string_metric.levenshtein_editops tests eventually
 def test_trivial():
-    assert seq_editops("abc", "abc") == []
+    assert levenshtein_editops("abc", "abc") == []
-    assert seq_editops("", "") == []
+    assert levenshtein_editops("", "") == []
 def test_insert():
-    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
+    assert levenshtein_editops("bc", "abc") == [("insert", 0, 0)]
-    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
+    assert levenshtein_editops("ac", "abc") == [("insert", 1, 1)]
-    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
+    assert levenshtein_editops("ab", "abc") == [("insert", 2, 2)]
-    assert seq_editops("", "a") == [("insert", 0, 0)]
+    assert levenshtein_editops("", "a") == [("insert", 0, 0)]
 def test_multiple():
-    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
+    assert levenshtein_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
 def test_delete():
-    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
+    assert levenshtein_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
-    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
+    assert levenshtein_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
-    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
+    assert levenshtein_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
-    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
+    assert levenshtein_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
-    assert seq_editops("Foo", "") == [
+    assert levenshtein_editops("Foo", "") == [
        ("delete", 0, 0),
        ("delete", 1, 0),
        ("delete", 2, 0),
    ]
-    assert seq_editops("Foolish", "Foo") == [
+    assert levenshtein_editops("Foolish", "Foo") == [
        ("delete", 3, 3),
        ("delete", 4, 3),
        ("delete", 5, 3),
@ -38,10 +41,10 @@ def test_delete():
 def test_ambiguous():
-    assert seq_editops("bcd", "abcef") == [
+    assert levenshtein_editops("bcd", "abcef") == [
        ("insert", 0, 0),
-        ("replace", 2, 3),
+        ("insert", 2, 3),
-        ("insert", 3, 4),
+        ("replace", 2, 4),
    ]
--- a/requirements.txt
+++ b/requirements.txt
@ -9,3 +9,4 @@ ocrd >= 2.20.1
 attrs
 multimethod == 1.3  # latest version to officially support Python 3.5
 tqdm
 rapidfuzz >= 1.8.1