Mirror of https://github.com/qurator-spk/dinglehopper.git (synced 2025-06-08 11:20:26 +02:00)
Switch between c and own implementation for distance and editops.
parent 11916c2dcf
commit 0e263cfac2
5 changed files with 153 additions and 83 deletions
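The idea of the change, in brief: dinglehopper keeps its own pure-Python Levenshtein implementation because it compares grapheme clusters, but that is slow. After this commit, distance() and editops() first normalize and segment their input; only if some cluster spans more than one code point do they take the own implementation, otherwise they delegate to python-Levenshtein's C functions. A minimal self-contained sketch of that dispatch pattern follows; slow_distance is a naive stand-in for the project's cached sequence implementation in edit_distance.py, not the actual code:

import unicodedata
from itertools import chain

from Levenshtein import distance as c_distance  # C implementation
from uniseg.graphemecluster import grapheme_clusters


def slow_distance(seq1, seq2):
    # Naive single-row Levenshtein DP over arbitrary sequences; a
    # stand-in for the project's own (cached) implementation.
    row = list(range(len(seq2) + 1))
    for i, c1 in enumerate(seq1, 1):
        prev, row[0] = row[0], i
        for j, c2 in enumerate(seq2, 1):
            prev, row[j] = row[j], min(row[j] + 1, row[j - 1] + 1, prev + (c1 != c2))
    return row[-1]


def distance(s1: str, s2: str):
    # Normalize, then segment into grapheme clusters.
    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
    # Only clusters longer than one code point need the slow path;
    # otherwise the C implementation on plain strings is equivalent.
    if any(len(s) > 1 for s in chain(seq1, seq2)):
        return slow_distance(seq1, seq2)
    return c_distance("".join(seq1), "".join(seq2))

The hunks below (rendered here as unified diff hunks; the per-file headers were lost in extraction) apply this pattern to distance() and editops() in the edit_distance module, update the tests accordingly, and switch seq_align() and word_error_rate_n() to the new entry points.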
@@ -12,7 +12,7 @@ def seq_align(s1, s2):
     """Align general sequences."""
     s1 = list(s1)
     s2 = list(s2)
-    ops = seq_editops(s1, s2)
+    ops = editops(s1, s2)
     i = 0
     j = 0
 
@@ -2,9 +2,11 @@ from __future__ import division, print_function
 
 import unicodedata
 from functools import partial, lru_cache
+from itertools import chain
 from typing import Sequence, Tuple, List
 
 import numpy as np
+from Levenshtein import editops as c_editops, distance as c_distance
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 from tqdm import tqdm
@@ -79,12 +81,17 @@ def levenshtein_matrix_cache_clear():
 def distance(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings
 
-    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
-    clusters. This should be the correct way to compare two Unicode strings.
+    Note that this is different from levenshtein() as this function knows about Unicode
+    normalization and grapheme clusters.
+
+    This should be the correct way to compare two Unicode strings.
     """
     seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
     seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    return distance(seq1, seq2)
+    if any(len(s) > 1 for s in chain(seq1, seq2)):
+        return distance(seq1, seq2)
+    else:
+        return distance_fast("".join(seq1), "".join(seq2))
 
 
 @multimethod
@@ -97,12 +104,24 @@ def distance(s1: List, s2: List):
     return levenshtein(s1, s2)
 
 
-def seq_editops(seq1, seq2):
+def distance_fast(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two Unicode strings
+
+    Also see `distance()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_distance(s1, s2)
+
+
+@multimethod
+def editops(seq1: List, seq2: List):
     """
     Return sequence of edit operations transforming one sequence to another.
 
-    This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary
-    sequences.
+    This aims to return the same/similar results as python-Levenshtein's editops(),
+    just generalized to arbitrary sequences.
     """
     seq1 = list(seq1)
     seq2 = list(seq2)
@@ -138,12 +157,27 @@ def seq_editops(seq1, seq2):
     return b
 
 
-def editops(word1, word2):
+@multimethod
+def editops(s1: str, s2: str):
     """
     Return sequence of edit operations transforming one string to another.
 
     Note that this returns indices to the _grapheme clusters_, not characters!
     """
-    word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
-    word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
-    return seq_editops(word1, word2)
+    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
+    if any(len(s) > 1 for s in chain(s1, s2)):
+        return editops(s1, s2)
+    else:
+        return editops_fast("".join(s1), "".join(s2))
+
+
+def editops_fast(s1: str, s2: str):
+    """Return sequence of edit operations transforming one string to another.
+
+    Also see `editops()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_editops(s1, s2)
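Likewise for editops() and editops_fast(): editops() indexes grapheme clusters, editops_fast() indexes raw code points. The expected values below follow TEST_UNICODE in this commit's tests and python-Levenshtein's behavior; the import path is again an assumption:

from qurator.dinglehopper import editops, editops_fast  # path assumed

# Grapheme-aware: the final clusters "ñ" and "m̃" differ, giving a
# single replace at cluster index 5.
assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]

# The fast variant is plain python-Levenshtein over code points:
assert editops_fast("ab", "abc") == [("insert", 2, 2)]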
@@ -1,13 +1,11 @@
 from __future__ import division, print_function
 
 import unicodedata
 
 import pytest
 
-from .. import levenshtein, distance
-
+from .. import distance, distance_fast
 
-TEST_PARAMS = "seq1,seq2,expected_dist"
+TEST_PARAMS = "s1,s2,expected_dist"
 
 TEST_STRINGS = [
     ("a", "a", 0),
@@ -24,30 +22,45 @@ TEST_STRINGS = [
 TEST_SEQUENCES = [(["a", "ab"], ["a", "ab", "c"], 1), (["a", "ab"], ["a", "c"], 1)]
 
+TEST_UNICODE = [
+    # Different, decomposed!
+    (unicodedata.normalize("NFC", "Schlyñ"), unicodedata.normalize("NFD", "Schlyñ"), 0),
+    # Same decomposition
+    (
+        # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
+        "Schlyñ",
+        # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
+        "Schlym̃",
+        1,
+    ),
+]
+
 
 @pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
-def test_distance_sequences(seq1, seq2, expected_dist):
-    dist = distance(seq1, seq2)
+def test_distance_sequences(s1, s2, expected_dist):
+    dist = distance(s1, s2)
     assert dist == expected_dist
 
 
 @pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_distance(seq1, seq2, expected_dist):
-    dist = distance(seq1, seq2)
+def test_distance_strings(s1, s2, expected_dist):
+    dist = distance(s1, s2)
     assert dist == expected_dist
 
 
-def test_distance_unicode_wide():
-    word1 = unicodedata.normalize("NFC", "Schlyñ")
-    word2 = unicodedata.normalize("NFD", "Schlyñ")  # Different, decomposed!
-    assert distance(word1, word2) == 0
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_distance_fast(s1, s2, expected_dist):
+    dist = distance_fast(s1, s2)
+    assert dist == expected_dist
 
-    word1 = "Schlyñ"
-    assert (
-        len(word1) == 6
-    )  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
-    word2 = "Schlym̃"
-    assert (
-        len(word2) == 7
-    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
-    assert distance(word1, word2) == 1
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_distance_fast_unicode(s1, s2, expected_dist):
+    dist = distance_fast(s1, s2)
+    assert dist != expected_dist
+
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_distance_unicode(s1, s2, expected_dist):
+    dist = distance(s1, s2)
+    assert dist == expected_dist
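The inequality asserted in test_distance_fast_unicode above is deliberate: the C-backed fast path compares code points, so canonically equivalent spellings stop being equal. This can be checked with python-Levenshtein alone:

import unicodedata

from Levenshtein import distance as c_distance

nfc = unicodedata.normalize("NFC", "Schlyñ")  # ends in U+00F1: 6 code points
nfd = unicodedata.normalize("NFD", "Schlyñ")  # ends in n + U+0303: 7 code points
assert c_distance(nfc, nfd) > 0  # not 0, unlike the grapheme-aware distance()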
@@ -1,63 +1,86 @@
 import unicodedata
 
-from .. import seq_editops, editops
+import pytest
+
+from .. import editops, editops_fast
+
+TEST_PARAMS = "s1,s2,expected_ops"
+
+TEST_STRINGS = [
+    # trivial
+    ("abc", "abc", []),
+    ("", "", []),
+    # insert
+    ("bc", "abc", [("insert", 0, 0)]),
+    ("ac", "abc", [("insert", 1, 1)]),
+    ("ab", "abc", [("insert", 2, 2)]),
+    ("", "a", [("insert", 0, 0)]),
+    # delete
+    ("abcdef", "cdef", [("delete", 0, 0), ("delete", 1, 0)]),
+    ("Xabcdef", "Xcdef", [("delete", 1, 1), ("delete", 2, 1)]),
+    ("abcdefg", "acdefX", [("delete", 1, 1), ("replace", 6, 5)]),
+    ("abcde", "aabcd", [("insert", 1, 1), ("delete", 4, 5)]),
+    ("Foo", "", [("delete", 0, 0), ("delete", 1, 0), ("delete", 2, 0)]),
+    (
+        "Foolish",
+        "Foo",
+        [("delete", 3, 3), ("delete", 4, 3), ("delete", 5, 3), ("delete", 6, 3)],
+    ),
+    # multiple
+    ("bcd", "abce", [("insert", 0, 0), ("replace", 2, 3)]),
+    # ambiguous
+    ("bcd", "abcef", [("insert", 0, 0), ("insert", 2, 3), ("replace", 2, 4)]),
+]
+
+TEST_SEQUENCES = [
+    (["a", "ab"], ["a", "ab", "c"], [("insert", 2, 2)]),
+    (["a", "ab"], ["a", "c"], [("replace", 1, 1)]),
+]
+
+TEST_UNICODE = [
+    # In these cases, one of the words has a composed form, the other one does not.
+    ("Schlyñ", "Schlym̃", [("replace", 5, 5)]),
+    ("oͤde", "öde", [("replace", 0, 0)]),
+    # equal
+    (
+        unicodedata.lookup("LATIN SMALL LETTER N")
+        + unicodedata.lookup("COMBINING TILDE"),
+        unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE"),
+        [],
+    ),
+]
 
 
-def test_trivial():
-    assert seq_editops("abc", "abc") == []
-    assert seq_editops("", "") == []
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_editops_strings(s1, s2, expected_ops):
+    ops = editops(s1, s2)
+    assert ops == expected_ops
 
 
-def test_insert():
-    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
-    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
-    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
-    assert seq_editops("", "a") == [("insert", 0, 0)]
+@pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
+def test_editops_sequences(s1, s2, expected_ops):
+    ops = editops(s1, s2)
+    assert ops == expected_ops
 
 
-def test_multiple():
-    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_editops_fast(s1, s2, expected_ops):
+    ops = editops_fast(s1, s2)
+    assert ops == expected_ops
 
 
-def test_delete():
-    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
-    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
-    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
-    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
-    assert seq_editops("Foo", "") == [
-        ("delete", 0, 0),
-        ("delete", 1, 0),
-        ("delete", 2, 0),
-    ]
-    assert seq_editops("Foolish", "Foo") == [
-        ("delete", 3, 3),
-        ("delete", 4, 3),
-        ("delete", 5, 3),
-        ("delete", 6, 3),
-    ]
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_editops_fast_unicode(s1, s2, expected_ops):
+    ops = editops_fast(s1, s2)
+    assert ops != expected_ops
 
 
-def test_ambiguous():
-    assert seq_editops("bcd", "abcef") == [
-        ("insert", 0, 0),
-        ("replace", 2, 3),
-        ("insert", 3, 4),
-    ]
-
-
-def test_editops():
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_editops_unicode(s1, s2, expected_ops):
     """Test editops() in cases where dealing with grapheme clusters matters"""
 
-    # In these cases, one of the words has a composed form, the other one does not.
-    assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
-    assert editops("oͤde", "öde") == [("replace", 0, 0)]
-
-
-def test_editops_canonically_equivalent():
-    left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
-        "COMBINING TILDE"
-    )
-    right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
-    assert left != right
-    assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
-    assert editops(left, right) == []
+    if not expected_ops:
+        assert s1 != s2
+        assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2)
+    ops = editops(s1, s2)
+    assert ops == expected_ops
@@ -6,7 +6,7 @@ from multimethod import multimethod
 
 import uniseg.wordbreak
 
-from .edit_distance import levenshtein
+from .edit_distance import distance
 from . import ExtractedText
 
 
@@ -81,7 +81,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
     reference_seq = list(reference)
     compared_seq = list(compared)
 
-    d = levenshtein(reference_seq, compared_seq)
+    d = distance(reference_seq, compared_seq)
     n = len(reference_seq)
 
     if d == 0:
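For context, word_error_rate_n() divides that distance by the reference length. A sketch of the surrounding function, assuming the usual empty-reference fallback for the lines this hunk elides:

from typing import Iterable, Tuple

from qurator.dinglehopper import distance  # path assumed


def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
    reference_seq = list(reference)
    compared_seq = list(compared)

    d = distance(reference_seq, compared_seq)  # new grapheme-aware entry point
    n = len(reference_seq)

    if d == 0:
        return 0, n
    # Assumed handling for the lines the hunk does not show:
    if n == 0:
        return float("inf"), n
    return d / n, n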