Switch between c and own implementation for distance and editops.

pull/48/head
Benjamin Rosemann 4 years ago
parent 11916c2dcf
commit 0e263cfac2
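
The commit makes distance() and editops() choose between the package's own grapheme-cluster-aware implementation and the C implementation from python-Levenshtein: the input is NFC-normalized and split into grapheme clusters, and only when some cluster spans more than one code point does the pure-Python path run. A minimal sketch of that decision follows; the helper name is hypothetical and not part of the diff.

import unicodedata
from itertools import chain

from uniseg.graphemecluster import grapheme_clusters


def needs_grapheme_aware_path(s1: str, s2: str) -> bool:
    # Hypothetical helper mirroring the check introduced in this commit: fall back
    # to the grapheme-cluster-aware implementation only if some cluster is longer
    # than a single code point after NFC normalization.
    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
    return any(len(g) > 1 for g in chain(seq1, seq2))


# "Schlyñ" composes to single code points under NFC, so the C path suffices;
# "Schlym̃" keeps m + COMBINING TILDE as a two-code-point cluster.
print(needs_grapheme_aware_path("Schlyñ", "Schlyñ"))   # False
print(needs_grapheme_aware_path("Schlym̃", "Schlym̃"))  # True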

@@ -12,7 +12,7 @@ def seq_align(s1, s2):
     """Align general sequences."""
     s1 = list(s1)
     s2 = list(s2)
-    ops = seq_editops(s1, s2)
+    ops = editops(s1, s2)
     i = 0
     j = 0

@@ -2,9 +2,11 @@ from __future__ import division, print_function
 
 import unicodedata
 from functools import partial, lru_cache
+from itertools import chain
 from typing import Sequence, Tuple, List
 
 import numpy as np
+from Levenshtein import editops as c_editops, distance as c_distance
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 from tqdm import tqdm
@@ -79,12 +81,17 @@ def levenshtein_matrix_cache_clear():
 def distance(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings
 
-    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
-    clusters. This should be the correct way to compare two Unicode strings.
+    Note that this is different from levenshtein() as this function knows about Unicode
+    normalization and grapheme clusters.
+
+    This should be the correct way to compare two Unicode strings.
     """
     seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
     seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    return distance(seq1, seq2)
+    if any(len(s) > 1 for s in chain(seq1, seq2)):
+        return distance(seq1, seq2)
+    else:
+        return distance_fast("".join(seq1), "".join(seq2))
 
 
 @multimethod
@@ -97,12 +104,24 @@ def distance(s1: List, s2: List):
     return levenshtein(s1, s2)
 
 
-def seq_editops(seq1, seq2):
+def distance_fast(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two Unicode strings
+
+    Also see `distance()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_distance(s1, s2)
+
+
+@multimethod
+def editops(seq1: List, seq2: List):
     """
     Return sequence of edit operations transforming one sequence to another.
 
-    This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary
-    sequences.
+    This aims to return the same/similar results as python-Levenshtein's editops(),
+    just generalized to arbitrary sequences.
     """
     seq1 = list(seq1)
     seq2 = list(seq2)
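
The overloading above relies on the multimethod package: the same function name is registered several times and the call is dispatched on the annotated argument types (str vs. List). A standalone illustration of that pattern, with a hypothetical describe() function that is not part of the diff:

from typing import List

from multimethod import multimethod


@multimethod
def describe(x: str):
    # Chosen when the argument is a plain string.
    return "string"


@multimethod
def describe(x: List):
    # Chosen when the argument is a list, e.g. a list of grapheme clusters or words.
    return "list"


print(describe("abc"))        # string
print(describe(["a", "ab"]))  # list
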
@@ -138,12 +157,27 @@ def seq_editops(seq1, seq2):
     return b
 
 
-def editops(word1, word2):
+@multimethod
+def editops(s1: str, s2: str):
     """
     Return sequence of edit operations transforming one string to another.
 
     Note that this returns indices to the _grapheme clusters_, not characters!
     """
-    word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
-    word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
-    return seq_editops(word1, word2)
+    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
+    if any(len(s) > 1 for s in chain(s1, s2)):
+        return editops(s1, s2)
+    else:
+        return editops_fast("".join(s1), "".join(s2))
+
+
+def editops_fast(s1: str, s2: str):
+    """Return sequence of edit operations transforming one string to another.
+
+    Also see `editops()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_editops(s1, s2)
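
For reference, the behavior this split preserves on the test data below: distance() normalizes and compares grapheme clusters, so canonically equivalent spellings compare equal, while the C-backed distance_fast() compares raw code points. A small illustration using only the standard library and python-Levenshtein (distance() and distance_fast() themselves are defined in the hunk above):

import unicodedata

from Levenshtein import distance as c_distance

composed = unicodedata.normalize("NFC", "Schlyñ")    # 6 code points, ends in U+00F1
decomposed = unicodedata.normalize("NFD", "Schlyñ")  # 7 code points, ends in n + COMBINING TILDE

print(len(composed), len(decomposed))        # 6 7
print(c_distance(composed, decomposed) > 0)  # True: code-point comparison sees a difference
# distance() normalizes to NFC first, so it reports 0 for this pair, which is
# exactly what the first TEST_UNICODE entry in the test diff below expects.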

@@ -1,13 +1,11 @@
-from __future__ import division, print_function
-
 import unicodedata
 
 import pytest
 
-from .. import levenshtein, distance
+from .. import distance, distance_fast
 
 
-TEST_PARAMS = "seq1,seq2,expected_dist"
+TEST_PARAMS = "s1,s2,expected_dist"
 
 TEST_STRINGS = [
     ("a", "a", 0),
@@ -24,30 +22,45 @@ TEST_STRINGS = [
 
 TEST_SEQUENCES = [(["a", "ab"], ["a", "ab", "c"], 1), (["a", "ab"], ["a", "c"], 1)]
 
+TEST_UNICODE = [
+    # Different, decomposed!
+    (unicodedata.normalize("NFC", "Schlyñ"), unicodedata.normalize("NFD", "Schlyñ"), 0),
+    # Same decomposition
+    (
+        # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
+        "Schlyñ",
+        # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
+        "Schlym̃",
+        1,
+    ),
+]
+
 
 @pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
-def test_distance_sequences(seq1, seq2, expected_dist):
-    dist = distance(seq1, seq2)
+def test_distance_sequences(s1, s2, expected_dist):
+    dist = distance(s1, s2)
+    assert dist == expected_dist
+
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_distance_strings(s1, s2, expected_dist):
+    dist = distance(s1, s2)
     assert dist == expected_dist
 
 
 @pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_distance(seq1, seq2, expected_dist):
-    dist = distance(seq1, seq2)
+def test_distance_fast(s1, s2, expected_dist):
+    dist = distance_fast(s1, s2)
     assert dist == expected_dist
 
 
-def test_distance_unicode_wide():
-    word1 = unicodedata.normalize("NFC", "Schlyñ")
-    word2 = unicodedata.normalize("NFD", "Schlyñ")  # Different, decomposed!
-    assert distance(word1, word2) == 0
-
-    word1 = "Schlyñ"
-    assert (
-        len(word1) == 6
-    )  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
-    word2 = "Schlym̃"
-    assert (
-        len(word2) == 7
-    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
-    assert distance(word1, word2) == 1
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_editops_fast_unicode(s1, s2, expected_dist):
+    dist = distance_fast(s1, s2)
+    assert dist != expected_dist
+
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_distance_unicode(s1, s2, expected_dist):
+    dist = distance(s1, s2)
+    assert dist == expected_dist

@@ -1,63 +1,86 @@
 import unicodedata
 
-from .. import seq_editops, editops
+import pytest
 
+from .. import editops, editops_fast
 
-def test_trivial():
-    assert seq_editops("abc", "abc") == []
-    assert seq_editops("", "") == []
+TEST_PARAMS = "s1,s2,expected_ops"
 
+TEST_STRINGS = [
+    # trivial
+    ("abc", "abc", []),
+    ("", "", []),
+    # insert
+    ("bc", "abc", [("insert", 0, 0)]),
+    ("ac", "abc", [("insert", 1, 1)]),
+    ("ab", "abc", [("insert", 2, 2)]),
+    ("", "a", [("insert", 0, 0)]),
+    # delete
+    ("abcdef", "cdef", [("delete", 0, 0), ("delete", 1, 0)]),
+    ("Xabcdef", "Xcdef", [("delete", 1, 1), ("delete", 2, 1)]),
+    ("abcdefg", "acdefX", [("delete", 1, 1), ("replace", 6, 5)]),
+    ("abcde", "aabcd", [("insert", 1, 1), ("delete", 4, 5)]),
+    ("Foo", "", [("delete", 0, 0), ("delete", 1, 0), ("delete", 2, 0)]),
+    (
+        "Foolish",
+        "Foo",
+        [("delete", 3, 3), ("delete", 4, 3), ("delete", 5, 3), ("delete", 6, 3)],
+    ),
+    # multiple
+    ("bcd", "abce", [("insert", 0, 0), ("replace", 2, 3)]),
+    # ambiguous
+    ("bcd", "abcef", [("insert", 0, 0), ("insert", 2, 3), ("replace", 2, 4)]),
+]
 
-def test_insert():
-    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
-    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
-    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
-    assert seq_editops("", "a") == [("insert", 0, 0)]
+TEST_SEQUENCES = [
+    (["a", "ab"], ["a", "ab", "c"], [("insert", 2, 2)]),
+    (["a", "ab"], ["a", "c"], [("replace", 1, 1)]),
+]
 
+TEST_UNICODE = [
+    # In these cases, one of the words has a composed form, the other one does not.
+    ("Schlyñ", "Schlym̃", [("replace", 5, 5)]),
+    ("oͤde", "öde", [("replace", 0, 0)]),
+    # equal
+    (
+        unicodedata.lookup("LATIN SMALL LETTER N")
+        + unicodedata.lookup("COMBINING TILDE"),
+        unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE"),
+        [],
+    ),
+]
 
-def test_multiple():
-    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
 
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_editops_strings(s1, s2, expected_ops):
+    ops = editops(s1, s2)
+    assert ops == expected_ops
 
-def test_delete():
-    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
-    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
-    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
-    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
-    assert seq_editops("Foo", "") == [
-        ("delete", 0, 0),
-        ("delete", 1, 0),
-        ("delete", 2, 0),
-    ]
-    assert seq_editops("Foolish", "Foo") == [
-        ("delete", 3, 3),
-        ("delete", 4, 3),
-        ("delete", 5, 3),
-        ("delete", 6, 3),
-    ]
 
+@pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
+def test_editops_sequences(s1, s2, expected_ops):
+    ops = editops(s1, s2)
+    assert ops == expected_ops
 
-def test_ambiguous():
-    assert seq_editops("bcd", "abcef") == [
-        ("insert", 0, 0),
-        ("replace", 2, 3),
-        ("insert", 3, 4),
-    ]
 
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_editops_fast(s1, s2, expected_ops):
+    ops = editops_fast(s1, s2)
+    assert ops == expected_ops
 
-def test_editops():
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_editops_fast_unicode(s1, s2, expected_ops):
+    ops = editops_fast(s1, s2)
+    assert ops != expected_ops
+
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_editops_unicode(s1, s2, expected_ops):
     """Test editops() in cases where dealing with grapheme clusters matters"""
-
-    # In these cases, one of the words has a composed form, the other one does not.
-    assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
-    assert editops("oͤde", "öde") == [("replace", 0, 0)]
-
-
-def test_editops_canonically_equivalent():
-    left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
-        "COMBINING TILDE"
-    )
-    right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
-    assert left != right
-    assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
-    assert editops(left, right) == []
+    if not expected_ops:
+        assert s1 != s2
+        assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2)
+    ops = editops(s1, s2)
+    assert ops == expected_ops
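
The test_editops_fast_unicode cases above rely on python-Levenshtein's editops() indexing code points while the grapheme-cluster-aware editops() indexes clusters, so the two disagree whenever a cluster spans several code points. A quick way to see this on the first TEST_UNICODE pair (illustrative only, not part of the test suite):

import unicodedata

from Levenshtein import editops as c_editops
from uniseg.graphemecluster import grapheme_clusters

s1 = unicodedata.normalize("NFC", "Schlyñ")   # 6 code points, 6 grapheme clusters
s2 = unicodedata.normalize("NFC", "Schlym̃")  # 7 code points, still 6 grapheme clusters

print(list(grapheme_clusters(s1)))  # [..., 'ñ']
print(list(grapheme_clusters(s2)))  # [..., 'm̃']
print(len(s1), len(s2))             # 6 7

# The cluster-aware editops() returns [("replace", 5, 5)] for this pair (see TEST_UNICODE),
# whereas the C implementation has to account for the extra code point, so its result
# necessarily differs, which is what test_editops_fast_unicode asserts.
print(c_editops(s1, s2))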

@@ -6,7 +6,7 @@ from multimethod import multimethod
 import uniseg.wordbreak
 
-from .edit_distance import levenshtein
+from .edit_distance import distance
 from . import ExtractedText
@@ -81,7 +81,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
     reference_seq = list(reference)
     compared_seq = list(compared)
 
-    d = levenshtein(reference_seq, compared_seq)
+    d = distance(reference_seq, compared_seq)
     n = len(reference_seq)
 
     if d == 0:
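
After this change, word_error_rate_n feeds the word sequences to the grapheme-cluster-aware distance(). The quantity it computes is the usual word error rate, edit distance over reference length. A hedged sketch of that calculation; the helper names are illustrative, and the empty-reference handling here is a guess rather than the module's actual behavior (the real function also returns n and special-cases d == 0 as shown above):

from typing import List, Sequence


def levenshtein_sketch(a: Sequence, b: Sequence) -> int:
    # Plain dynamic-programming edit distance over arbitrary sequences,
    # standing in for the distance() multimethod used in the hunk above.
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        cur = [i]
        for j, y in enumerate(b, 1):
            cur.append(min(prev[j] + 1,               # deletion
                           cur[j - 1] + 1,            # insertion
                           prev[j - 1] + (x != y)))   # substitution
        prev = cur
    return prev[-1]


def word_error_rate_sketch(reference: List[str], compared: List[str]) -> float:
    # Word error rate: edit distance between the word sequences divided by the
    # length of the reference sequence.
    d = levenshtein_sketch(reference, compared)
    n = len(reference)
    return d / n if n > 0 else float("inf")


print(word_error_rate_sketch(["this", "is", "a", "test"], ["this", "is", "test"]))  # 0.25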
