Switch between C and own implementation for distance and editops.
parent 11916c2dcf
commit 0e263cfac2
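The commit title refers to switching between the C implementation and the package's own pure-Python code for distance and editops; the test diff below exercises both paths. The sketch here is only an illustration of that pattern, not this project's API: the Levenshtein module, the fast= flag, and the helper names are assumptions.

# Illustrative only: dispatch between a C-backed implementation and a
# pure-Python fallback. Module and helper names are assumptions for this sketch.
try:
    from Levenshtein import distance as _c_distance
    HAVE_C = True
except ImportError:  # C extension missing: fall back to pure Python
    HAVE_C = False


def _py_distance(s1, s2):
    """Plain Wagner-Fischer edit distance, used when the C extension is absent."""
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    previous = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, 1):
        current = [i]
        for j, c2 in enumerate(s2, 1):
            current.append(min(
                previous[j] + 1,               # delete c1
                current[j - 1] + 1,            # insert c2
                previous[j - 1] + (c1 != c2),  # keep or replace
            ))
        previous = current
    return previous[-1]


def distance(s1, s2, fast=True):
    """Prefer the C implementation when it is installed and requested."""
    if fast and HAVE_C:
        return _c_distance(s1, s2)
    return _py_distance(s1, s2)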
@@ -1,63 +1,86 @@
import unicodedata

from .. import seq_editops, editops


def test_trivial():
    assert seq_editops("abc", "abc") == []
    assert seq_editops("", "") == []
import pytest

from .. import editops, editops_fast

TEST_PARAMS = "s1,s2,expected_ops"

TEST_STRINGS = [
    # trivial
    ("abc", "abc", []),
    ("", "", []),
    # insert
    ("bc", "abc", [("insert", 0, 0)]),
    ("ac", "abc", [("insert", 1, 1)]),
    ("ab", "abc", [("insert", 2, 2)]),
    ("", "a", [("insert", 0, 0)]),
    # delete
    ("abcdef", "cdef", [("delete", 0, 0), ("delete", 1, 0)]),
    ("Xabcdef", "Xcdef", [("delete", 1, 1), ("delete", 2, 1)]),
    ("abcdefg", "acdefX", [("delete", 1, 1), ("replace", 6, 5)]),
    ("abcde", "aabcd", [("insert", 1, 1), ("delete", 4, 5)]),
    ("Foo", "", [("delete", 0, 0), ("delete", 1, 0), ("delete", 2, 0)]),
    (
        "Foolish",
        "Foo",
        [("delete", 3, 3), ("delete", 4, 3), ("delete", 5, 3), ("delete", 6, 3)],
    ),
    # multiple
    ("bcd", "abce", [("insert", 0, 0), ("replace", 2, 3)]),
    # ambiguous
    ("bcd", "abcef", [("insert", 0, 0), ("insert", 2, 3), ("replace", 2, 4)]),
]

TEST_SEQUENCES = [
    (["a", "ab"], ["a", "ab", "c"], [("insert", 2, 2)]),
    (["a", "ab"], ["a", "c"], [("replace", 1, 1)]),
]

TEST_UNICODE = [
    # In these cases, one of the words has a composed form, the other one does not.
    ("Schlyñ", "Schlym̃", [("replace", 5, 5)]),
    ("oͤde", "öde", [("replace", 0, 0)]),
    # equal
    (
        unicodedata.lookup("LATIN SMALL LETTER N")
        + unicodedata.lookup("COMBINING TILDE"),
        unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE"),
        [],
    ),
]


def test_insert():
    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
    assert seq_editops("", "a") == [("insert", 0, 0)]
@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
def test_editops_strings(s1, s2, expected_ops):
    ops = editops(s1, s2)
    assert ops == expected_ops


def test_multiple():
    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
@pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
def test_editops_sequences(s1, s2, expected_ops):
    ops = editops(s1, s2)
    assert ops == expected_ops


def test_delete():
    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
    assert seq_editops("Foo", "") == [
        ("delete", 0, 0),
        ("delete", 1, 0),
        ("delete", 2, 0),
    ]
    assert seq_editops("Foolish", "Foo") == [
        ("delete", 3, 3),
        ("delete", 4, 3),
        ("delete", 5, 3),
        ("delete", 6, 3),
    ]
@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
def test_editops_fast(s1, s2, expected_ops):
    ops = editops_fast(s1, s2)
    assert ops == expected_ops


def test_ambiguous():
    assert seq_editops("bcd", "abcef") == [
        ("insert", 0, 0),
        ("replace", 2, 3),
        ("insert", 3, 4),
    ]
@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
def test_editops_fast_unicode(s1, s2, expected_ops):
    ops = editops_fast(s1, s2)
    assert ops != expected_ops


def test_editops():
@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
def test_editops_unicode(s1, s2, expected_ops):
    """Test editops() in cases where dealing with grapheme clusters matters"""

    # In these cases, one of the words has a composed form, the other one does not.
    assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
    assert editops("oͤde", "öde") == [("replace", 0, 0)]


def test_editops_canonically_equivalent():
    left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
        "COMBINING TILDE"
    )
    right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
    assert left != right
    assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
    assert editops(left, right) == []
    if not expected_ops:
        assert s1 != s2
        assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2)
    ops = editops(s1, s2)
    assert ops == expected_ops
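The TEST_UNICODE cases turn on grapheme clusters: "oͤ" (o plus a combining mark) and the precomposed "ö" are each one user-perceived character, so the test data expects a single replace at index 0, while a plain code-point comparison sees sequences of different length (which is why test_editops_fast_unicode expects a different result). A rough sketch of the grouping idea, assuming it is enough to attach combining marks to the preceding base character; full segmentation is specified by UAX #29 and is not what this snippet implements.

import unicodedata


def grapheme_groups(s):
    """Attach each combining mark to the base character before it.

    Only a rough approximation of grapheme clusters, sufficient for the
    composed/decomposed pairs in TEST_UNICODE; real segmentation follows
    UAX #29.
    """
    groups = []
    for ch in s:
        if groups and unicodedata.combining(ch):
            groups[-1] += ch   # combining mark joins the previous group
        else:
            groups.append(ch)  # new base character starts a new group
    return groups


# Both spellings come out as three groups, so turning one into the other
# takes a single "replace" at index 0.
assert grapheme_groups("oͤde") == ["oͤ", "d", "e"]
assert grapheme_groups("öde") == ["ö", "d", "e"]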