diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py
index c7e7733..68e45be 100644
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -12,7 +12,7 @@ def seq_align(s1, s2):
     """Align general sequences."""
     s1 = list(s1)
     s2 = list(s2)
-    ops = seq_editops(s1, s2)
+    ops = editops(s1, s2)
 
     i = 0
     j = 0
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index e6724f5..b906fa9 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -2,9 +2,11 @@ from __future__ import division, print_function
 
 import unicodedata
 from functools import partial, lru_cache
+from itertools import chain
 from typing import Sequence, Tuple, List
 
 import numpy as np
+from Levenshtein import editops as c_editops, distance as c_distance
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 from tqdm import tqdm
@@ -79,12 +81,17 @@ def levenshtein_matrix_cache_clear():
 def distance(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings
 
-    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
-    clusters. This should be the correct way to compare two Unicode strings.
+    Note that this is different from levenshtein() as this function knows about Unicode
+    normalization and grapheme clusters.
+
+    This should be the correct way to compare two Unicode strings.
     """
     seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
     seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    return distance(seq1, seq2)
+    if any(len(s) > 1 for s in chain(seq1, seq2)):
+        return distance(seq1, seq2)
+    else:
+        return distance_fast("".join(seq1), "".join(seq2))
 
 
 @multimethod
@@ -97,12 +104,24 @@ def distance(s1: List, s2: List):
     return levenshtein(s1, s2)
 
 
-def seq_editops(seq1, seq2):
+def distance_fast(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two Unicode strings
+
+    Also see `distance()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_distance(s1, s2)
+
+
+@multimethod
+def editops(seq1: List, seq2: List):
     """
     Return sequence of edit operations transforming one sequence to another.
 
-    This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary
-    sequences.
+    This aims to return the same/similar results as python-Levenshtein's editops(),
+    just generalized to arbitrary sequences.
     """
     seq1 = list(seq1)
     seq2 = list(seq2)
@@ -138,12 +157,27 @@
     return b
 
 
-def editops(word1, word2):
+@multimethod
+def editops(s1: str, s2: str):
     """
     Return sequence of edit operations transforming one string to another.
 
     Note that this returns indices to the _grapheme clusters_, not characters!
     """
-    word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
-    word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
-    return seq_editops(word1, word2)
+    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
+    if any(len(s) > 1 for s in chain(s1, s2)):
+        return editops(s1, s2)
+    else:
+        return editops_fast("".join(s1), "".join(s2))
+
+
+def editops_fast(s1: str, s2: str):
+    """Return sequence of edit operations transforming one string to another.
+
+    Also see `editops()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_editops(s1, s2)
diff --git a/qurator/dinglehopper/tests/test_edit_distance.py b/qurator/dinglehopper/tests/test_edit_distance.py
index fb50061..ed31379 100644
--- a/qurator/dinglehopper/tests/test_edit_distance.py
+++ b/qurator/dinglehopper/tests/test_edit_distance.py
@@ -1,13 +1,11 @@
-from __future__ import division, print_function
-
 import unicodedata
 
 import pytest
 
-from .. import levenshtein, distance
+from .. import distance, distance_fast
 
-TEST_PARAMS = "seq1,seq2,expected_dist"
+TEST_PARAMS = "s1,s2,expected_dist"
 
 TEST_STRINGS = [
     ("a", "a", 0),
@@ -24,30 +22,45 @@ TEST_STRINGS = [
 
 TEST_SEQUENCES = [(["a", "ab"], ["a", "ab", "c"], 1), (["a", "ab"], ["a", "c"], 1)]
 
+TEST_UNICODE = [
+    # Canonically equivalent, but the second string is decomposed!
+    (unicodedata.normalize("NFC", "Schlyñ"), unicodedata.normalize("NFD", "Schlyñ"), 0),
+    # Actually different: ñ vs. m + COMBINING TILDE
+    (
+        # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
+        "Schlyñ",
+        # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
+        "Schlym̃",
+        1,
+    ),
+]
+
 
 @pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
-def test_distance_sequences(seq1, seq2, expected_dist):
-    dist = distance(seq1, seq2)
+def test_distance_sequences(s1, s2, expected_dist):
+    dist = distance(s1, s2)
+    assert dist == expected_dist
+
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_distance_strings(s1, s2, expected_dist):
+    dist = distance(s1, s2)
     assert dist == expected_dist
 
 
 @pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_distance(seq1, seq2, expected_dist):
-    dist = distance(seq1, seq2)
+def test_distance_fast(s1, s2, expected_dist):
+    dist = distance_fast(s1, s2)
     assert dist == expected_dist
 
 
-def test_distance_unicode_wide():
-    word1 = unicodedata.normalize("NFC", "Schlyñ")
-    word2 = unicodedata.normalize("NFD", "Schlyñ")  # Different, decomposed!
-    assert distance(word1, word2) == 0
-
-    word1 = "Schlyñ"
-    assert (
-        len(word1) == 6
-    )  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
-    word2 = "Schlym̃"
-    assert (
-        len(word2) == 7
-    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
-    assert distance(word1, word2) == 1
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_distance_fast_unicode(s1, s2, expected_dist):
+    dist = distance_fast(s1, s2)
+    assert dist != expected_dist
+
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_distance_unicode(s1, s2, expected_dist):
+    dist = distance(s1, s2)
+    assert dist == expected_dist
diff --git a/qurator/dinglehopper/tests/test_editops.py b/qurator/dinglehopper/tests/test_editops.py
index 06afbfc..5a10db2 100644
--- a/qurator/dinglehopper/tests/test_editops.py
+++ b/qurator/dinglehopper/tests/test_editops.py
@@ -1,63 +1,86 @@
 import unicodedata
 
-from .. import seq_editops, editops
-
-
-def test_trivial():
-    assert seq_editops("abc", "abc") == []
-    assert seq_editops("", "") == []
+import pytest
+
+from .. import editops, editops_fast
+
+TEST_PARAMS = "s1,s2,expected_ops"
+
+TEST_STRINGS = [
+    # trivial
+    ("abc", "abc", []),
+    ("", "", []),
+    # insert
+    ("bc", "abc", [("insert", 0, 0)]),
+    ("ac", "abc", [("insert", 1, 1)]),
+    ("ab", "abc", [("insert", 2, 2)]),
+    ("", "a", [("insert", 0, 0)]),
+    # delete
+    ("abcdef", "cdef", [("delete", 0, 0), ("delete", 1, 0)]),
+    ("Xabcdef", "Xcdef", [("delete", 1, 1), ("delete", 2, 1)]),
+    ("abcdefg", "acdefX", [("delete", 1, 1), ("replace", 6, 5)]),
+    ("abcde", "aabcd", [("insert", 1, 1), ("delete", 4, 5)]),
+    ("Foo", "", [("delete", 0, 0), ("delete", 1, 0), ("delete", 2, 0)]),
+    (
+        "Foolish",
+        "Foo",
+        [("delete", 3, 3), ("delete", 4, 3), ("delete", 5, 3), ("delete", 6, 3)],
+    ),
+    # multiple
+    ("bcd", "abce", [("insert", 0, 0), ("replace", 2, 3)]),
+    # ambiguous
+    ("bcd", "abcef", [("insert", 0, 0), ("replace", 2, 3), ("insert", 3, 4)]),
+]
+
+TEST_SEQUENCES = [
+    (["a", "ab"], ["a", "ab", "c"], [("insert", 2, 2)]),
+    (["a", "ab"], ["a", "c"], [("replace", 1, 1)]),
+]
+
+TEST_UNICODE = [
+    # In these cases, one of the words has a composed form, the other one does not.
+    ("Schlyñ", "Schlym̃", [("replace", 5, 5)]),
+    ("oͤde", "öde", [("replace", 0, 0)]),
+    # equal
+    (
+        unicodedata.lookup("LATIN SMALL LETTER N")
+        + unicodedata.lookup("COMBINING TILDE"),
+        unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE"),
+        [],
+    ),
+]
 
 
-def test_insert():
-    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
-    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
-    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
-    assert seq_editops("", "a") == [("insert", 0, 0)]
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_editops_strings(s1, s2, expected_ops):
+    ops = editops(s1, s2)
+    assert ops == expected_ops
 
 
-def test_multiple():
-    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
+@pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
+def test_editops_sequences(s1, s2, expected_ops):
+    ops = editops(s1, s2)
+    assert ops == expected_ops
 
 
-def test_delete():
-    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
-    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
-    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
-    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
-    assert seq_editops("Foo", "") == [
-        ("delete", 0, 0),
-        ("delete", 1, 0),
-        ("delete", 2, 0),
-    ]
-    assert seq_editops("Foolish", "Foo") == [
-        ("delete", 3, 3),
-        ("delete", 4, 3),
-        ("delete", 5, 3),
-        ("delete", 6, 3),
-    ]
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_editops_fast(s1, s2, expected_ops):
+    ops = editops_fast(s1, s2)
+    assert ops == expected_ops
 
 
-def test_ambiguous():
-    assert seq_editops("bcd", "abcef") == [
-        ("insert", 0, 0),
-        ("replace", 2, 3),
-        ("insert", 3, 4),
-    ]
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_editops_fast_unicode(s1, s2, expected_ops):
+    ops = editops_fast(s1, s2)
+    assert ops != expected_ops
 
 
-def test_editops():
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_editops_unicode(s1, s2, expected_ops):
     """Test editops() in cases where dealing with grapheme clusters matters"""
-    # In these cases, one of the words has a composed form, the other one does not.
-    assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
-    assert editops("oͤde", "öde") == [("replace", 0, 0)]
-
-
-def test_editops_canonically_equivalent():
-    left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
-        "COMBINING TILDE"
-    )
-    right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
-    assert left != right
-    assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
-    assert editops(left, right) == []
+    if not expected_ops:
+        assert s1 != s2
+        assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2)
+    ops = editops(s1, s2)
+    assert ops == expected_ops
diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py
index dde57b9..76707df 100644
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -6,7 +6,7 @@
 from multimethod import multimethod
 import uniseg.wordbreak
 
-from .edit_distance import levenshtein
+from .edit_distance import distance
 
 from . import ExtractedText
 
@@ -81,7 +81,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
     reference_seq = list(reference)
     compared_seq = list(compared)
 
-    d = levenshtein(reference_seq, compared_seq)
+    d = distance(reference_seq, compared_seq)
     n = len(reference_seq)
 
     if d == 0:
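
Reviewer note, not part of the patch: the *_fast_unicode tests assert `!=` on purpose. distance_fast() and editops_fast() skip Unicode normalization and grapheme clustering, so on the canonically equivalent TEST_UNICODE cases they must disagree with the expected values, while distance() and editops() only take the fast path when every grapheme cluster is a single code point.

The following minimal sketch illustrates that guard. It uses only modules the patch already imports (stdlib unicodedata, python-Levenshtein, uniseg); the variable names are illustrative, and the asserted distances are ordinary code-point Levenshtein results.

import unicodedata
from itertools import chain

from Levenshtein import distance as c_distance
from uniseg.graphemecluster import grapheme_clusters

composed = unicodedata.normalize("NFC", "Schlyñ")    # ends in U+00F1, 6 code points
decomposed = unicodedata.normalize("NFD", "Schlyñ")  # ends in n + U+0303, 7 code points

# The C implementation compares raw code points (replace ñ with n, insert the
# combining tilde), so canonically equivalent strings do not compare equal:
assert c_distance(composed, decomposed) == 2

# distance() normalizes to NFC before anything else, which makes both sides identical:
assert c_distance(unicodedata.normalize("NFC", composed),
                  unicodedata.normalize("NFC", decomposed)) == 0

# "Schlym̃" keeps a two-code-point grapheme cluster (m + COMBINING TILDE) even under
# NFC, so the guard `any(len(s) > 1 for s in chain(seq1, seq2))` rejects the fast
# path and falls back to the pure-Python grapheme-cluster implementation:
seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")))
seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", "Schlym̃")))
assert any(len(s) > 1 for s in chain(seq1, seq2))

Once the guard has established that every cluster is a single code point, joining the clusters back into a plain string and letting the C implementation count code-point edits gives the same result as the grapheme-cluster implementation, which is what makes the fast path safe.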