Switch from custom Levenshtein to python-Levenshtein
As the distance and editops calculations are a performance bottleneck in this application, we replaced the custom Levenshtein implementation with the C implementation from the python-Levenshtein package. We now also provide separate entrypoints for texts with and without Unicode normalization, because normalization can be done more efficiently once, during preprocessing.
parent 0e263cfac2
commit e371da899e
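
Before the diff itself, a minimal usage sketch of the split entrypoints described above. The import path is an assumption about the repository layout, and the snippet is illustrative rather than part of the commit:

# Minimal sketch; the module path below is an assumption, not part of
# the commit.
from qurator.dinglehopper.edit_distance import distance, distance_unicode

gt = "Schly\u00f1"    # "ñ" as the precomposed code point U+00F1
ocr = "Schlym\u0303"  # "m" + U+0303 COMBINING TILDE: one grapheme cluster

# Fast entrypoint: assumes preprocessing already normalized the input and
# compares code points as-is, so the combining tilde counts separately.
print(distance(gt, ocr))          # 2 (replace "ñ" -> "m", insert U+0303)

# Unicode-aware entrypoint: NFC-normalizes and segments into grapheme
# clusters first, so "m̃" is compared as a single symbol.
print(distance_unicode(gt, ocr))  # 1 (one cluster replaced)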
@@ -1,183 +1,136 @@
 from __future__ import division, print_function

 import unicodedata
-from functools import partial, lru_cache
 from itertools import chain
-from typing import Sequence, Tuple, List
+from typing import List, Union, Tuple

-import numpy as np
 from Levenshtein import editops as c_editops, distance as c_distance
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
-from tqdm import tqdm

 from .extracted_text import ExtractedText
-from .config import Config


-def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
-    edit distance.
-
-    This algorithm is implemented here because we need an implementation that can work with sequences other than
-    strings, e.g. lists of grapheme clusters or lists of word strings.
-    """
-
-    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
-    # sequences to tuples to make them hashable.
-    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
-
-
-@lru_cache(maxsize=10)
-def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-
-    This is an LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
-    """
-    m = len(seq1)
-    n = len(seq2)
-
-    def from_to(start, stop):
-        return range(start, stop + 1, 1)
-
-    D = np.zeros((m + 1, n + 1), np.int)
-    D[0, 0] = 0
-    for i in from_to(1, m):
-        D[i, 0] = i
-    for j in from_to(1, n):
-        D[0, j] = j
-    for i in tqdm(from_to(1, m), disable=not Config.progress):
-        for j in from_to(1, n):
-            D[i, j] = min(
-                D[i - 1, j - 1]
-                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
-                D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1,  # Deletion
-            )
-
-    return D
-
-
-def levenshtein(seq1, seq2):
-    """Compute the Levenshtein edit distance between two sequences"""
-    m = len(seq1)
-    n = len(seq2)
-
-    D = levenshtein_matrix(seq1, seq2)
-    return D[m, n]
-
-
-def levenshtein_matrix_cache_clear():
-    """Clear internal Levenshtein matrix cache.
-
-    You want to do this between different input file pairs to decrease memory
-    usage by not caching results from prior input files.
-    """
-    _levenshtein_matrix.cache_clear()
-
-
-@multimethod
-def distance(s1: str, s2: str):
-    """Compute the Levenshtein edit distance between two Unicode strings
-
-    Note that this is different from levenshtein() as this function knows about Unicode
-    normalization and grapheme clusters.
-
-    This should be the correct way to compare two Unicode strings.
-    """
-    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
-    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    if any(len(s) > 1 for s in chain(seq1, seq2)):
-        return distance(seq1, seq2)
-    else:
-        return distance_fast("".join(seq1), "".join(seq2))
-
-
-@multimethod
-def distance(s1: ExtractedText, s2: ExtractedText):
-    return distance(s1.text, s2.text)
-
-
-@multimethod
-def distance(s1: List, s2: List):
-    return levenshtein(s1, s2)
-
-
-def distance_fast(s1: str, s2: str):
-    """Compute the Levenshtein edit distance between two Unicode strings
-
-    Also see `distance()`.
-
-    The difference is that this implementation does not care about grapheme clusters or
-    unicode normalization, assuming that this already has been done in preprocessing.
-    """
-    return c_distance(s1, s2)
-
-
-@multimethod
-def editops(seq1: List, seq2: List):
-    """
-    Return sequence of edit operations transforming one sequence to another.
-
-    This aims to return the same/similar results as python-Levenshtein's editops(),
-    just generalized to arbitrary sequences.
-    """
-    seq1 = list(seq1)
-    seq2 = list(seq2)
-    m = len(seq1)
-    n = len(seq2)
-    D = levenshtein_matrix(seq1, seq2)
-
-    def _tail_backtrace(i, j, accumulator):
-        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
-            )
-        if j > 0 and D[i, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
-        return accumulator

-    def backtrace(i, j):
-        result = partial(_tail_backtrace, i, j, [])
-        while isinstance(result, partial):
-            result = result()
-
-        return result
-
-    b = backtrace(m, n)
-    return b
-
-
-@multimethod
-def editops(s1: str, s2: str):
-    """
-    Return sequence of edit operations transforming one string to another.
-
-    Note that this returns indices to the _grapheme clusters_, not characters!
-    """
-    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
-    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    if any(len(s) > 1 for s in chain(s1, s2)):
-        return editops(s1, s2)
-    else:
-        return editops_fast("".join(s1), "".join(s2))
-
-
-def editops_fast(s1: str, s2: str):
-    """Return sequence of edit operations transforming one string to another.
-
-    Also see `editops()`.
-
-    The difference is that this implementation does not care about grapheme clusters or
-    unicode normalization, assuming that this already has been done in preprocessing.
-    """
-    return c_editops(s1, s2)
+@multimethod
+def distance_unicode(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two Unicode strings
+
+    Note that this is different from distance() as this function knows about Unicode
+    normalization and grapheme clusters.
+
+    This should be the correct way to compare two Unicode strings.
+    """
+    s1, s2 = transform_unicode(s1, s2)
+    return distance(s1, s2)
+
+
+@multimethod
+def distance_unicode(s1: ExtractedText, s2: ExtractedText):
+    """Compute the Levenshtein edit distance between two Unicode strings
+
+    Note that this is different from distance() as this function knows about Unicode
+    normalization and grapheme clusters.
+
+    This should be the correct way to compare two Unicode strings.
+    """
+    return distance_unicode(s1.text, s2.text)
+
+
+@multimethod
+def distance(l1: List, l2: List):
+    """Compute the Levenshtein edit distance between two lists.
+
+    Also see `distance_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    s1, s2 = transform_lists(l1, l2)
+    return c_distance(s1, s2)
+
+
+@multimethod
+def distance(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two strings.
+
+    Also see `distance_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_distance(s1, s2)
+
+
+@multimethod
+def distance(s1: ExtractedText, s2: ExtractedText):
+    """Compute the Levenshtein edit distance between two strings.
+
+    Also see `distance_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return distance(s1.text, s2.text)
+
+
+@multimethod
+def editops_unicode(s1: str, s2: str):
+    """Return sequence of edit operations transforming one string to another.
+
+    Note that this returns indices to the _grapheme clusters_, not characters!
+    """
+    s1, s2 = transform_unicode(s1, s2)
+    return editops(s1, s2)
+
+
+@multimethod
+def editops(l1: List, l2: List):
+    """Return sequence of edit operations transforming one list to another.
+
+    Also see `editops_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    s1, s2 = transform_lists(l1, l2)
+    return c_editops(s1, s2)
+
+
+@multimethod
+def editops(s1: str, s2: str):
+    """Return sequence of edit operations transforming one string to another.
+
+    Also see `editops_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_editops(s1, s2)
+
+
+def transform_lists(l1: List, l2: List) -> Tuple[str, str]:
+    """Transform two lists into string representation.
+
+    We need this transformation to be able to calculate a Levenshtein distance
+    between two sequences.
+
+    Note that we can only process 1,114,111 unique elements with this implementation.
+    See https://docs.python.org/3/library/functions.html#chr
+    """
+    mapping = {el: chr(i) for i, el in enumerate(frozenset(chain(l1, l2)))}
+    s1 = "".join([mapping[el] for el in l1])
+    s2 = "".join([mapping[el] for el in l2])
+    return s1, s2
+
+
+def transform_unicode(s1: str, s2: str) -> Union[Tuple[str, str], Tuple[List[str], List[str]]]:
+    """Transform two text sequences to unicode representation.
+
+    Normalize to unicode and decide whether we have wide chars
+    that need to be represented by lists.
+    """
+    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
+    if all(len(s) < 2 for s in chain(s1, s2)):
+        s1, s2 = "".join(s1), "".join(s2)
+    return s1, s2
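
The transform_lists() helper is what lets the C implementation, which only accepts strings, diff arbitrary sequences such as word lists: every unique element is mapped to a one-character stand-in. A self-contained sketch of the same trick; the word_editops wrapper is a hypothetical name, not part of the commit:

from itertools import chain
from Levenshtein import distance as c_distance, editops as c_editops

def word_editops(words1, words2):
    # Hypothetical wrapper illustrating the chr() mapping from the diff:
    # every unique element becomes a single stand-in character, limited to
    # 1,114,111 distinct elements (the number of Unicode code points).
    mapping = {el: chr(i) for i, el in enumerate(frozenset(chain(words1, words2)))}
    s1 = "".join(mapping[el] for el in words1)
    s2 = "".join(mapping[el] for el in words2)
    # Returned indices refer to positions in the word lists, not characters.
    return c_distance(s1, s2), c_editops(s1, s2)

dist, ops = word_editops("the quick fox".split(), "the quick brown fox".split())
print(dist)  # 1
print(ops)   # [('insert', 2, 2)]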
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
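
The other preprocessing half, transform_unicode(), decides between the two representations. A sketch of its two return shapes, mirroring the logic in the diff (assumes the uniseg package is installed):

import unicodedata
from itertools import chain
from uniseg.graphemecluster import grapheme_clusters

def transform_unicode(s1, s2):
    # Same logic as in the diff above, repeated here to be self-contained.
    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
    if all(len(s) < 2 for s in chain(s1, s2)):
        s1, s2 = "".join(s1), "".join(s2)
    return s1, s2

# "a" + U+0308 fuses into the precomposed "ä" under NFC: every cluster is a
# single code point, so plain strings come back and the C fast path applies.
print(transform_unicode("Ka\u0308se", "Kase"))  # ('Käse', 'Kase')

# "m" + U+0303 has no precomposed form: that cluster stays two code points
# wide, so lists are kept and comparison happens per grapheme cluster.
print(transform_unicode("m\u0303", "m"))        # (['m̃'], ['m'])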