Switch between c and own implementation for distance and editops.

pull/48/head
Benjamin Rosemann 4 years ago
parent 11916c2dcf
commit 0e263cfac2
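
The commit makes distance() and editops() choose between the package's own grapheme-cluster-aware implementation and the C implementation from python-Levenshtein: the input is NFC-normalized and split into grapheme clusters, and only when some cluster spans more than one code point does the pure-Python path run. A minimal sketch of that decision follows; the helper name is hypothetical and not part of the diff.

import unicodedata
from itertools import chain

from uniseg.graphemecluster import grapheme_clusters


def needs_grapheme_aware_path(s1: str, s2: str) -> bool:
    # Hypothetical helper mirroring the check introduced in this commit: fall back
    # to the grapheme-cluster-aware implementation only if some cluster is longer
    # than a single code point after NFC normalization.
    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
    return any(len(g) > 1 for g in chain(seq1, seq2))


# "Schlyñ" composes to single code points under NFC, so the C path suffices;
# "Schlym̃" keeps m + COMBINING TILDE as a two-code-point cluster.
print(needs_grapheme_aware_path("Schlyñ", "Schlyñ"))   # False
print(needs_grapheme_aware_path("Schlym̃", "Schlym̃"))  # True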

@@ -12,7 +12,7 @@ def seq_align(s1, s2):
     """Align general sequences."""
     s1 = list(s1)
     s2 = list(s2)
-    ops = seq_editops(s1, s2)
+    ops = editops(s1, s2)
     i = 0
     j = 0

@@ -2,9 +2,11 @@ from __future__ import division, print_function
 
 import unicodedata
 from functools import partial, lru_cache
+from itertools import chain
 from typing import Sequence, Tuple, List
 
 import numpy as np
+from Levenshtein import editops as c_editops, distance as c_distance
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 from tqdm import tqdm
@@ -79,12 +81,17 @@ def levenshtein_matrix_cache_clear():
 def distance(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings
 
-    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
-    clusters. This should be the correct way to compare two Unicode strings.
+    Note that this is different from levenshtein() as this function knows about Unicode
+    normalization and grapheme clusters.
+
+    This should be the correct way to compare two Unicode strings.
     """
     seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
     seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    return distance(seq1, seq2)
+    if any(len(s) > 1 for s in chain(seq1, seq2)):
+        return distance(seq1, seq2)
+    else:
+        return distance_fast("".join(seq1), "".join(seq2))
 
 
 @multimethod
@@ -97,12 +104,24 @@ def distance(s1: List, s2: List):
     return levenshtein(s1, s2)
 
 
-def seq_editops(seq1, seq2):
+def distance_fast(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two Unicode strings
+
+    Also see `distance()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_distance(s1, s2)
+
+
+@multimethod
+def editops(seq1: List, seq2: List):
     """
     Return sequence of edit operations transforming one sequence to another.
 
-    This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary
-    sequences.
+    This aims to return the same/similar results as python-Levenshtein's editops(),
+    just generalized to arbitrary sequences.
     """
     seq1 = list(seq1)
     seq2 = list(seq2)
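
The overloading above relies on the multimethod package: the same function name is registered several times and the call is dispatched on the annotated argument types (str vs. List). A standalone illustration of that pattern, with a hypothetical describe() function that is not part of the diff:

from typing import List

from multimethod import multimethod


@multimethod
def describe(x: str):
    # Chosen when the argument is a plain string.
    return "string"


@multimethod
def describe(x: List):
    # Chosen when the argument is a list, e.g. a list of grapheme clusters or words.
    return "list"


print(describe("abc"))        # string
print(describe(["a", "ab"]))  # list
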
@@ -138,12 +157,27 @@ def seq_editops(seq1, seq2):
     return b
 
 
-def editops(word1, word2):
+@multimethod
+def editops(s1: str, s2: str):
     """
     Return sequence of edit operations transforming one string to another.
 
     Note that this returns indices to the _grapheme clusters_, not characters!
     """
-    word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
-    word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
-    return seq_editops(word1, word2)
+    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
+    if any(len(s) > 1 for s in chain(s1, s2)):
+        return editops(s1, s2)
+    else:
+        return editops_fast("".join(s1), "".join(s2))
+
+
+def editops_fast(s1: str, s2: str):
+    """Return sequence of edit operations transforming one string to another.
+
+    Also see `editops()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_editops(s1, s2)
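
For reference, the behavior this split preserves on the test data below: distance() normalizes and compares grapheme clusters, so canonically equivalent spellings compare equal, while the C-backed distance_fast() compares raw code points. A small illustration using only the standard library and python-Levenshtein (distance() and distance_fast() themselves are defined in the hunk above):

import unicodedata

from Levenshtein import distance as c_distance

composed = unicodedata.normalize("NFC", "Schlyñ")    # 6 code points, ends in U+00F1
decomposed = unicodedata.normalize("NFD", "Schlyñ")  # 7 code points, ends in n + COMBINING TILDE

print(len(composed), len(decomposed))        # 6 7
print(c_distance(composed, decomposed) > 0)  # True: code-point comparison sees a difference
# distance() normalizes to NFC first, so it reports 0 for this pair, which is
# exactly what the first TEST_UNICODE entry in the test diff below expects.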

@@ -1,13 +1,11 @@
-from __future__ import division, print_function
-
 import unicodedata
 
 import pytest
 
-from .. import levenshtein, distance
+from .. import distance, distance_fast
 
 
-TEST_PARAMS = "seq1,seq2,expected_dist"
+TEST_PARAMS = "s1,s2,expected_dist"
 
 TEST_STRINGS = [
     ("a", "a", 0),
@@ -24,30 +22,45 @@ TEST_STRINGS = [
 
 TEST_SEQUENCES = [(["a", "ab"], ["a", "ab", "c"], 1), (["a", "ab"], ["a", "c"], 1)]
 
+TEST_UNICODE = [
+    # Different, decomposed!
+    (unicodedata.normalize("NFC", "Schlyñ"), unicodedata.normalize("NFD", "Schlyñ"), 0),
+    # Same decomposition
+    (
+        # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
+        "Schlyñ",
+        # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
+        "Schlym̃",
+        1,
+    ),
+]
+
 
 @pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
-def test_distance_sequences(seq1, seq2, expected_dist):
-    dist = distance(seq1, seq2)
+def test_distance_sequences(s1, s2, expected_dist):
+    dist = distance(s1, s2)
+    assert dist == expected_dist
+
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_distance_strings(s1, s2, expected_dist):
+    dist = distance(s1, s2)
     assert dist == expected_dist
 
 
 @pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_distance(seq1, seq2, expected_dist):
-    dist = distance(seq1, seq2)
+def test_distance_fast(s1, s2, expected_dist):
+    dist = distance_fast(s1, s2)
     assert dist == expected_dist
 
 
-def test_distance_unicode_wide():
-    word1 = unicodedata.normalize("NFC", "Schlyñ")
-    word2 = unicodedata.normalize("NFD", "Schlyñ")  # Different, decomposed!
-    assert distance(word1, word2) == 0
-
-    word1 = "Schlyñ"
-    assert (
-        len(word1) == 6
-    )  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
-    word2 = "Schlym̃"
-    assert (
-        len(word2) == 7
-    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
-    assert distance(word1, word2) == 1
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_editops_fast_unicode(s1, s2, expected_dist):
+    dist = distance_fast(s1, s2)
+    assert dist != expected_dist
+
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_distance_unicode(s1, s2, expected_dist):
+    dist = distance(s1, s2)
+    assert dist == expected_dist

@@ -1,63 +1,86 @@
 import unicodedata
 
-from .. import seq_editops, editops
+import pytest
 
+from .. import editops, editops_fast
 
-def test_trivial():
-    assert seq_editops("abc", "abc") == []
-    assert seq_editops("", "") == []
+TEST_PARAMS = "s1,s2,expected_ops"
 
+TEST_STRINGS = [
+    # trivial
+    ("abc", "abc", []),
+    ("", "", []),
+    # insert
+    ("bc", "abc", [("insert", 0, 0)]),
+    ("ac", "abc", [("insert", 1, 1)]),
+    ("ab", "abc", [("insert", 2, 2)]),
+    ("", "a", [("insert", 0, 0)]),
+    # delete
+    ("abcdef", "cdef", [("delete", 0, 0), ("delete", 1, 0)]),
+    ("Xabcdef", "Xcdef", [("delete", 1, 1), ("delete", 2, 1)]),
+    ("abcdefg", "acdefX", [("delete", 1, 1), ("replace", 6, 5)]),
+    ("abcde", "aabcd", [("insert", 1, 1), ("delete", 4, 5)]),
+    ("Foo", "", [("delete", 0, 0), ("delete", 1, 0), ("delete", 2, 0)]),
+    (
+        "Foolish",
+        "Foo",
+        [("delete", 3, 3), ("delete", 4, 3), ("delete", 5, 3), ("delete", 6, 3)],
+    ),
+    # multiple
+    ("bcd", "abce", [("insert", 0, 0), ("replace", 2, 3)]),
+    # ambiguous
+    ("bcd", "abcef", [("insert", 0, 0), ("insert", 2, 3), ("replace", 2, 4)]),
+]
 
-def test_insert():
-    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
-    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
-    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
-    assert seq_editops("", "a") == [("insert", 0, 0)]
+TEST_SEQUENCES = [
+    (["a", "ab"], ["a", "ab", "c"], [("insert", 2, 2)]),
+    (["a", "ab"], ["a", "c"], [("replace", 1, 1)]),
+]
 
+TEST_UNICODE = [
+    # In these cases, one of the words has a composed form, the other one does not.
+    ("Schlyñ", "Schlym̃", [("replace", 5, 5)]),
+    ("oͤde", "öde", [("replace", 0, 0)]),
+    # equal
+    (
+        unicodedata.lookup("LATIN SMALL LETTER N")
+        + unicodedata.lookup("COMBINING TILDE"),
+        unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE"),
+        [],
+    ),
+]
 
-def test_multiple():
-    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
 
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_editops_strings(s1, s2, expected_ops):
+    ops = editops(s1, s2)
+    assert ops == expected_ops
 
-def test_delete():
-    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
-    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
-    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
-    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
-    assert seq_editops("Foo", "") == [
-        ("delete", 0, 0),
-        ("delete", 1, 0),
-        ("delete", 2, 0),
-    ]
-    assert seq_editops("Foolish", "Foo") == [
-        ("delete", 3, 3),
-        ("delete", 4, 3),
-        ("delete", 5, 3),
-        ("delete", 6, 3),
-    ]
 
+@pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
+def test_editops_sequences(s1, s2, expected_ops):
+    ops = editops(s1, s2)
+    assert ops == expected_ops
 
-def test_ambiguous():
-    assert seq_editops("bcd", "abcef") == [
-        ("insert", 0, 0),
-        ("replace", 2, 3),
-        ("insert", 3, 4),
-    ]
 
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_editops_fast(s1, s2, expected_ops):
+    ops = editops_fast(s1, s2)
+    assert ops == expected_ops
 
-def test_editops():
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_editops_fast_unicode(s1, s2, expected_ops):
+    ops = editops_fast(s1, s2)
+    assert ops != expected_ops
+
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_editops_unicode(s1, s2, expected_ops):
     """Test editops() in cases where dealing with grapheme clusters matters"""
-
-    # In these cases, one of the words has a composed form, the other one does not.
-    assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
-    assert editops("oͤde", "öde") == [("replace", 0, 0)]
-
-
-def test_editops_canonically_equivalent():
-    left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
-        "COMBINING TILDE"
-    )
-    right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
-    assert left != right
-    assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
-    assert editops(left, right) == []
+    if not expected_ops:
+        assert s1 != s2
+        assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2)
+    ops = editops(s1, s2)
+    assert ops == expected_ops
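
The test_editops_fast_unicode cases above rely on python-Levenshtein's editops() indexing code points while the grapheme-cluster-aware editops() indexes clusters, so the two disagree whenever a cluster spans several code points. A quick way to see this on the first TEST_UNICODE pair (illustrative only, not part of the test suite):

import unicodedata

from Levenshtein import editops as c_editops
from uniseg.graphemecluster import grapheme_clusters

s1 = unicodedata.normalize("NFC", "Schlyñ")   # 6 code points, 6 grapheme clusters
s2 = unicodedata.normalize("NFC", "Schlym̃")  # 7 code points, still 6 grapheme clusters

print(list(grapheme_clusters(s1)))  # [..., 'ñ']
print(list(grapheme_clusters(s2)))  # [..., 'm̃']
print(len(s1), len(s2))             # 6 7

# The cluster-aware editops() returns [("replace", 5, 5)] for this pair (see TEST_UNICODE),
# whereas the C implementation has to account for the extra code point, so its result
# necessarily differs, which is what test_editops_fast_unicode asserts.
print(c_editops(s1, s2))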

@@ -6,7 +6,7 @@ from multimethod import multimethod
 import uniseg.wordbreak
 
-from .edit_distance import levenshtein
+from .edit_distance import distance
 from . import ExtractedText
@@ -81,7 +81,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
     reference_seq = list(reference)
     compared_seq = list(compared)
 
-    d = levenshtein(reference_seq, compared_seq)
+    d = distance(reference_seq, compared_seq)
     n = len(reference_seq)
 
     if d == 0:
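
After this change, word_error_rate_n feeds the word sequences to the grapheme-cluster-aware distance(). The quantity it computes is the usual word error rate, edit distance over reference length. A hedged sketch of that calculation; the helper names are illustrative, and the empty-reference handling here is a guess rather than the module's actual behavior (the real function also returns n and special-cases d == 0 as shown above):

from typing import List, Sequence


def levenshtein_sketch(a: Sequence, b: Sequence) -> int:
    # Plain dynamic-programming edit distance over arbitrary sequences,
    # standing in for the distance() multimethod used in the hunk above.
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        cur = [i]
        for j, y in enumerate(b, 1):
            cur.append(min(prev[j] + 1,               # deletion
                           cur[j - 1] + 1,            # insertion
                           prev[j - 1] + (x != y)))   # substitution
        prev = cur
    return prev[-1]


def word_error_rate_sketch(reference: List[str], compared: List[str]) -> float:
    # Word error rate: edit distance between the word sequences divided by the
    # length of the reference sequence.
    d = levenshtein_sketch(reference, compared)
    n = len(reference)
    return d / n if n > 0 else float("inf")


print(word_error_rate_sketch(["this", "is", "a", "test"], ["this", "is", "test"]))  # 0.25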
