Switch from custom Levenshtein to python-Levenshtein

As the distance and editops calculations are a performance bottleneck in this application, we replaced the custom Levenshtein implementation with the C implementation from the python-Levenshtein package. We now also have separate entry points for texts with and without Unicode normalization, because the normalization can be done more efficiently once during preprocessing.
2025-07-14 21:09:56 +02:00 · 2020-11-16 12:06:44 +01:00 · 2020-11-16 12:06:44 +01:00 · e371da899e
commit e371da899e
parent 0e263cfac2
7 changed files with 98 additions and 1210 deletions
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@ -6,7 +6,7 @@ from typing import Tuple
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters

-from .edit_distance import distance
+from .edit_distance import distance_unicode
 from .extracted_text import ExtractedText


@ -18,7 +18,7 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
    :return: character error rate and length of the reference
    """

-    d = distance(reference, compared)
+    d = distance_unicode(reference, compared)
    n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))

    if d == 0:
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@ -1,113 +1,59 @@
 from __future__ import division, print_function

 import unicodedata
-from functools import partial, lru_cache
 from itertools import chain
-from typing import Sequence, Tuple, List
+from typing import List, Union, Tuple

-import numpy as np
 from Levenshtein import editops as c_editops, distance as c_distance
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
-from tqdm import tqdm

 from .extracted_text import ExtractedText
-from .config import Config
-
-
-def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-    This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired
-    edit distance.
-
-    This algorithm is implemented here because we need an implementation that can work with sequences other than
-    strings, e.g. lists of grapheme clusters or lists of word strings.
-    """
-
-    # Internally, we use a cached version. As the cache only works on hashable parameters, we convert the input
-    # sequences to tuples to make them hashable.
-    return _levenshtein_matrix(tuple(seq1), tuple(seq2))
-
-
-@lru_cache(maxsize=10)
-def _levenshtein_matrix(seq1: Tuple, seq2: Tuple):
-    """Compute the matrix commonly computed to produce the Levenshtein distance.
-
-    This is a LRU cached function not meant to be used directly. Use levenshtein_matrix() instead.
-    """
-    m = len(seq1)
-    n = len(seq2)
-
-    def from_to(start, stop):
-        return range(start, stop + 1, 1)
-
-    D = np.zeros((m + 1, n + 1), np.int)
-    D[0, 0] = 0
-    for i in from_to(1, m):
-        D[i, 0] = i
-    for j in from_to(1, n):
-        D[0, j] = j
-    for i in tqdm(from_to(1, m), disable=not Config.progress):
-        for j in from_to(1, n):
-            D[i, j] = min(
-                D[i - 1, j - 1]
-                + 1 * (seq1[i - 1] != seq2[j - 1]),  # Same or Substitution
-                D[i, j - 1] + 1,  # Insertion
-                D[i - 1, j] + 1,  # Deletion
-            )
-
-    return D
-
-
-def levenshtein(seq1, seq2):
-    """Compute the Levenshtein edit distance between two sequences"""
-    m = len(seq1)
-    n = len(seq2)
-
-    D = levenshtein_matrix(seq1, seq2)
-    return D[m, n]
-
-
-def levenshtein_matrix_cache_clear():
-    """Clear internal Levenshtein matrix cache.
-
-    You want to do this between different input file pairs to decrease memory
-    usage by not caching results from prior input files.
-    """
-    _levenshtein_matrix.cache_clear()


@multimethod
-def distance(s1: str, s2: str):
+def distance_unicode(s1: str, s2: str):
    """Compute the Levenshtein edit distance between two Unicode strings

-    Note that this is different from levenshtein() as this function knows about Unicode
+    Note that this is different from distance() as this function knows about Unicode
    normalization and grapheme clusters.

    This should be the correct way to compare two Unicode strings.
    """
-    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
-    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    if any(len(s) > 1 for s in chain(seq1, seq2)):
-        return distance(seq1, seq2)
-    else:
-        return distance_fast("".join(seq1), "".join(seq2))
+    s1, s2 = transform_unicode(s1, s2)
+    return distance(s1, s2)


@multimethod
-def distance(s1: ExtractedText, s2: ExtractedText):
-    return distance(s1.text, s2.text)
-
-
-@multimethod
-def distance(s1: List, s2: List):
-    return levenshtein(s1, s2)
-
-
-def distance_fast(s1: str, s2: str):
+def distance_unicode(s1: ExtractedText, s2: ExtractedText):
    """Compute the Levenshtein edit distance between two Unicode strings

-    Also see `distance()`.
+    Note that this is different from distance() as this function knows about Unicode
+    normalization and grapheme clusters.
+
+    This should be the correct way to compare two Unicode strings.
+    """
+    return distance_unicode(s1.text, s2.text)
+
+
+@multimethod
+def distance(l1: List, l2: List):
+    """Compute the Levenshtein edit distance between two lists.
+
+    Also see `distance_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    s1, s2 = transform_lists(l1, l2)
+    return c_distance(s1, s2)
+
+
+@multimethod
+def distance(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two strings.
+
+    Also see `distance_unicode()`.

    The difference is that this implementation does not care about grapheme clusters or
    unicode normalization, assuming that this already has been done in preprocessing.
@ -116,68 +62,75 @@ def distance_fast(s1: str, s2: str):


@multimethod
-def editops(seq1: List, seq2: List):
+def distance(s1: ExtractedText, s2: ExtractedText):
+    """Compute the Levenshtein edit distance between two strings.
+
+    Also see `distance_unicode()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
    """
-    Return sequence of edit operations transforming one sequence to another.
+    return distance(s1.text, s2.text)

-    This aims to return the same/similar results as python-Levenshtein's editops(),
-    just generalized to arbitrary sequences.
+
+@multimethod
+def editops_unicode(s1: str, s2: str):
+    """Return sequence of edit operations transforming one string to another.
+
+    Note that this returns indices to the _grapheme clusters_, not characters!
    """
-    seq1 = list(seq1)
-    seq2 = list(seq2)
-    m = len(seq1)
-    n = len(seq2)
-    D = levenshtein_matrix(seq1, seq2)
+    s1, s2 = transform_unicode(s1, s2)
+    return editops(s1, s2)

-    def _tail_backtrace(i, j, accumulator):
-        if i > 0 and D[i - 1, j] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j, [("delete", i - 1, j)] + accumulator
-            )
-        if j > 0 and D[i, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i, j - 1, [("insert", i, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] + 1 == D[i, j]:
-            return partial(
-                _tail_backtrace, i - 1, j - 1, [("replace", i - 1, j - 1)] + accumulator
-            )
-        if i > 0 and j > 0 and D[i - 1, j - 1] == D[i, j]:
-            return partial(_tail_backtrace, i - 1, j - 1, accumulator)  # NOP
-        return accumulator

-    def backtrace(i, j):
-        result = partial(_tail_backtrace, i, j, [])
-        while isinstance(result, partial):
-            result = result()
+@multimethod
+def editops(l1: List, l2: List):
+    """Return sequence of edit operations transforming one list to another.

-        return result
+    Also see `editops_unicode()`.

-    b = backtrace(m, n)
-    return b
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    s1, s2 = transform_lists(l1, l2)
+    return c_editops(s1, s2)


@multimethod
 def editops(s1: str, s2: str):
-    """
-    Return sequence of edit operations transforming one string to another.
-
-    Note that this returns indices to the _grapheme clusters_, not characters!
-    """
-    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
-    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    if any(len(s) > 1 for s in chain(s1, s2)):
-        return editops(s1, s2)
-    else:
-        return editops_fast("".join(s1), "".join(s2))
-
-
-def editops_fast(s1: str, s2: str):
    """Return sequence of edit operations transforming one string to another.

-    Also see `editops()`.
+    Also see `editops_unicode()`.

    The difference is that this implementation does not care about grapheme clusters or
    unicode normalization, assuming that this already has been done in preprocessing.
    """
    return c_editops(s1, s2)
+
+
+def transform_lists(l1: List, l2: List) -> Tuple[str, str]:
+    """Transform two lists into string representation.
+
+    We need this transformation to be able to calculate a Levenshtein distance
+    between two sequences.
+
+    Note that we can only process 1,114,111 unique elements with this implementation.
+    See https://docs.python.org/3/library/functions.html#chr
+    """
+    mapping = {el: chr(i) for i, el in enumerate(frozenset(chain(l1, l2)))}
+    s1 = "".join([mapping[el] for el in l1])
+    s2 = "".join([mapping[el] for el in l2])
+    return s1, s2
+
+
+def transform_unicode(s1: str, s2: str) -> Union[Tuple[str, str], Tuple[List[str]]]:
+    """Transform two text sequences to unicode representation.
+
+    Normalize to unicode and decides whether we have wide chars
+    that needs to be represented by lists.
+    """
+    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
+    if all(len(s) < 2 for s in chain(s1, s2)):
+        s1, s2 = "".join(s1), "".join(s2)
+    return s1, s2
--- a/qurator/dinglehopper/notebooks/Levenshtein.ipynb
+++ b/qurator/dinglehopper/notebooks/Levenshtein.ipynb
--- a/qurator/dinglehopper/ocrd_cli.py
+++ b/qurator/dinglehopper/ocrd_cli.py
@ -8,7 +8,6 @@ from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
 from pkg_resources import resource_string

 from .cli import process as cli_process
-from .edit_distance import levenshtein_matrix_cache_clear

 OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))

@ -74,8 +73,6 @@ class OcrdDinglehopperEvaluate(Processor):
                    local_filename=report_prefix + report_suffix,
                )

-            # Clear cache between files
-            levenshtein_matrix_cache_clear()

 if __name__ == "__main__":
    ocrd_dinglehopper()
--- a/qurator/dinglehopper/tests/test_edit_distance.py
+++ b/qurator/dinglehopper/tests/test_edit_distance.py
@ -2,7 +2,7 @@ import unicodedata

 import pytest

-from .. import distance, distance_fast
+from .. import distance, distance_unicode


 TEST_PARAMS = "s1,s2,expected_dist"
@ -42,25 +42,13 @@ def test_distance_sequences(s1, s2, expected_dist):
    assert dist == expected_dist


-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_distance_strings(s1, s2, expected_dist):
-    dist = distance(s1, s2)
-    assert dist == expected_dist
-
-
-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_distance_fast(s1, s2, expected_dist):
-    dist = distance_fast(s1, s2)
-    assert dist == expected_dist
-
-
@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
-def test_editops_fast_unicode(s1, s2, expected_dist):
-    dist = distance_fast(s1, s2)
+def test_distance_with_unicode(s1, s2, expected_dist):
+    dist = distance(s1, s2)
    assert dist != expected_dist


@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
 def test_distance_unicode(s1, s2, expected_dist):
-    dist = distance(s1, s2)
+    dist = distance_unicode(s1, s2)
    assert dist == expected_dist
--- a/qurator/dinglehopper/tests/test_editops.py
+++ b/qurator/dinglehopper/tests/test_editops.py
@ -2,7 +2,7 @@ import unicodedata

 import pytest

-from .. import editops, editops_fast
+from .. import editops, editops_unicode

 TEST_PARAMS = "s1,s2,expected_ops"

@ -51,36 +51,22 @@ TEST_UNICODE = [
 ]


-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_editops_strings(s1, s2, expected_ops):
-    ops = editops(s1, s2)
-    assert ops == expected_ops
-
-
@pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
-def test_editops_sequences(s1, s2, expected_ops):
+def test_editops(s1, s2, expected_ops):
    ops = editops(s1, s2)
    assert ops == expected_ops


-@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_editops_fast(s1, s2, expected_ops):
-    ops = editops_fast(s1, s2)
-    assert ops == expected_ops
-
-
@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
-def test_editops_fast_unicode(s1, s2, expected_ops):
-    ops = editops_fast(s1, s2)
+def test_editops_with_unicode(s1, s2, expected_ops):
+    ops = editops(s1, s2)
    assert ops != expected_ops


@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
 def test_editops_unicode(s1, s2, expected_ops):
-    """Test editops() in cases where dealing with grapheme clusters matters"""
-
    if not expected_ops:
        assert s1 != s2
        assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2)
-    ops = editops(s1, s2)
+    ops = editops_unicode(s1, s2)
    assert ops == expected_ops
--- a/requirements.txt
+++ b/requirements.txt
@ -9,3 +9,4 @@ ocrd >= 2.20.1
 attrs
 multimethod == 1.3  # latest version to officially support Python 3.5
 tqdm
+python-levenshtein