Mirror of https://github.com/qurator-spk/dinglehopper.git, synced 2025-06-07 19:05:13 +02:00
Switch between the C and our own implementation for distance and editops.
This commit is contained in:
parent 11916c2dcf
commit 0e263cfac2
5 changed files with 153 additions and 83 deletions
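In short, the commit makes distance() and editops() normalize their input to NFC and split it into grapheme clusters; only when some cluster spans more than one code point do they fall back to the own pure-Python sequence implementation, otherwise they delegate to the fast C implementation from python-Levenshtein (distance_fast/editops_fast). A minimal illustration of why the fast path needs that guard, assuming python-Levenshtein is installed (values taken from this commit's tests):

    import unicodedata

    from Levenshtein import distance as c_distance

    # "Schlyñ" in NFC is 6 code points (ending in LATIN SMALL LETTER N WITH TILDE);
    # in NFD it is 7 (n + COMBINING TILDE). The C implementation compares code
    # points, so it sees two edits between canonically equivalent strings ...
    nfc = unicodedata.normalize("NFC", "Schlyñ")
    nfd = unicodedata.normalize("NFD", "Schlyñ")
    assert c_distance(nfc, nfd) == 2

    # ... while the grapheme-cluster-aware distance() below reports 0 for this
    # pair, and 1 for "Schlyñ" vs "Schlym̃" (one cluster replaced).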
@@ -12,7 +12,7 @@ def seq_align(s1, s2):
     """Align general sequences."""
     s1 = list(s1)
     s2 = list(s2)
-    ops = seq_editops(s1, s2)
+    ops = editops(s1, s2)
     i = 0
     j = 0
 
@@ -2,9 +2,11 @@ from __future__ import division, print_function
 
 import unicodedata
 from functools import partial, lru_cache
+from itertools import chain
 from typing import Sequence, Tuple, List
 
 import numpy as np
+from Levenshtein import editops as c_editops, distance as c_distance
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 from tqdm import tqdm
@@ -79,12 +81,17 @@ def levenshtein_matrix_cache_clear():
 def distance(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings
 
-    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
-    clusters. This should be the correct way to compare two Unicode strings.
+    Note that this is different from levenshtein() as this function knows about Unicode
+    normalization and grapheme clusters.
+
+    This should be the correct way to compare two Unicode strings.
     """
     seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
     seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    return distance(seq1, seq2)
+    if any(len(s) > 1 for s in chain(seq1, seq2)):
+        return distance(seq1, seq2)
+    else:
+        return distance_fast("".join(seq1), "".join(seq2))
 
 
 @multimethod
@@ -97,12 +104,24 @@ def distance(s1: List, s2: List):
     return levenshtein(s1, s2)
 
 
-def seq_editops(seq1, seq2):
+def distance_fast(s1: str, s2: str):
+    """Compute the Levenshtein edit distance between two Unicode strings
+
+    Also see `distance()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_distance(s1, s2)
+
+
+@multimethod
+def editops(seq1: List, seq2: List):
     """
     Return sequence of edit operations transforming one sequence to another.
 
-    This aims to return the same/similar results as python-Levenshtein's editops(), just generalized to arbitrary
-    sequences.
+    This aims to return the same/similar results as python-Levenshtein's editops(),
+    just generalized to arbitrary sequences.
     """
     seq1 = list(seq1)
     seq2 = list(seq2)
@@ -138,12 +157,27 @@ def seq_editops(seq1, seq2):
     return b
 
 
-def editops(word1, word2):
+@multimethod
+def editops(s1: str, s2: str):
     """
     Return sequence of edit operations transforming one string to another.
 
     Note that this returns indices to the _grapheme clusters_, not characters!
     """
-    word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
-    word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
-    return seq_editops(word1, word2)
+    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
+    if any(len(s) > 1 for s in chain(s1, s2)):
+        return editops(s1, s2)
+    else:
+        return editops_fast("".join(s1), "".join(s2))
+
+
+def editops_fast(s1: str, s2: str):
+    """Return sequence of edit operations transforming one string to another.
+
+    Also see `editops()`.
+
+    The difference is that this implementation does not care about grapheme clusters or
+    unicode normalization, assuming that this already has been done in preprocessing.
+    """
+    return c_editops(s1, s2)
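Taken together, the edit_distance.py changes above leave one public entry point per operation that picks the right implementation itself. A hypothetical usage sketch (the import path is assumed; the expected values are taken from this commit's tests):

    from qurator.dinglehopper.edit_distance import distance, editops

    # Multi-code-point grapheme clusters: the own sequence implementation runs.
    assert distance("Schlyñ", "Schlym̃") == 1
    assert editops("oͤde", "öde") == [("replace", 0, 0)]

    # Plain ASCII: every cluster is a single code point, so the C-backed
    # distance_fast()/editops_fast() handle it under the hood.
    assert distance("Foo", "") == 3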
@@ -1,13 +1,11 @@
-from __future__ import division, print_function
-
 import unicodedata
 
 import pytest
 
-from .. import levenshtein, distance
+from .. import distance, distance_fast
 
 
-TEST_PARAMS = "seq1,seq2,expected_dist"
+TEST_PARAMS = "s1,s2,expected_dist"
 
 TEST_STRINGS = [
     ("a", "a", 0),
@@ -24,30 +22,45 @@ TEST_STRINGS = [
 
 TEST_SEQUENCES = [(["a", "ab"], ["a", "ab", "c"], 1), (["a", "ab"], ["a", "c"], 1)]
 
+TEST_UNICODE = [
+    # Different, decomposed!
+    (unicodedata.normalize("NFC", "Schlyñ"), unicodedata.normalize("NFD", "Schlyñ"), 0),
+    # Same decomposition
+    (
+        # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
+        "Schlyñ",
+        # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
+        "Schlym̃",
+        1,
+    ),
+]
+
+
 @pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
-def test_distance_sequences(seq1, seq2, expected_dist):
-    dist = distance(seq1, seq2)
+def test_distance_sequences(s1, s2, expected_dist):
+    dist = distance(s1, s2)
     assert dist == expected_dist
 
 
 @pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
-def test_distance(seq1, seq2, expected_dist):
-    dist = distance(seq1, seq2)
+def test_distance_strings(s1, s2, expected_dist):
+    dist = distance(s1, s2)
     assert dist == expected_dist
 
 
-def test_distance_unicode_wide():
-    word1 = unicodedata.normalize("NFC", "Schlyñ")
-    word2 = unicodedata.normalize("NFD", "Schlyñ")  # Different, decomposed!
-    assert distance(word1, word2) == 0
-
-    word1 = "Schlyñ"
-    assert (
-        len(word1) == 6
-    )  # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
-    word2 = "Schlym̃"
-    assert (
-        len(word2) == 7
-    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
-    assert distance(word1, word2) == 1
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_distance_fast(s1, s2, expected_dist):
+    dist = distance_fast(s1, s2)
+    assert dist == expected_dist
+
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_distance_fast_unicode(s1, s2, expected_dist):
+    dist = distance_fast(s1, s2)
+    assert dist != expected_dist
+
+
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_distance_unicode(s1, s2, expected_dist):
+    dist = distance(s1, s2)
+    assert dist == expected_dist
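Note that the inverted assertion in test_distance_fast_unicode is intentional: on the TEST_UNICODE pairs, the code-point-based fast implementation must disagree with the cluster-aware distance(). A quick illustration of the disagreement being pinned down (python-Levenshtein assumed installed):

    from Levenshtein import distance as c_distance

    # distance() reports 1 here (one grapheme cluster replaced); the code-point
    # view sees a replaced "ñ" plus an inserted COMBINING TILDE, i.e. 2.
    assert c_distance("Schlyñ", "Schlym̃") == 2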
@@ -1,63 +1,86 @@
 import unicodedata
 
-from .. import seq_editops, editops
+import pytest
+
+from .. import editops, editops_fast
+
+TEST_PARAMS = "s1,s2,expected_ops"
+
+TEST_STRINGS = [
+    # trivial
+    ("abc", "abc", []),
+    ("", "", []),
+    # insert
+    ("bc", "abc", [("insert", 0, 0)]),
+    ("ac", "abc", [("insert", 1, 1)]),
+    ("ab", "abc", [("insert", 2, 2)]),
+    ("", "a", [("insert", 0, 0)]),
+    # delete
+    ("abcdef", "cdef", [("delete", 0, 0), ("delete", 1, 0)]),
+    ("Xabcdef", "Xcdef", [("delete", 1, 1), ("delete", 2, 1)]),
+    ("abcdefg", "acdefX", [("delete", 1, 1), ("replace", 6, 5)]),
+    ("abcde", "aabcd", [("insert", 1, 1), ("delete", 4, 5)]),
+    ("Foo", "", [("delete", 0, 0), ("delete", 1, 0), ("delete", 2, 0)]),
+    (
+        "Foolish",
+        "Foo",
+        [("delete", 3, 3), ("delete", 4, 3), ("delete", 5, 3), ("delete", 6, 3)],
+    ),
+    # multiple
+    ("bcd", "abce", [("insert", 0, 0), ("replace", 2, 3)]),
+    # ambiguous
+    ("bcd", "abcef", [("insert", 0, 0), ("insert", 2, 3), ("replace", 2, 4)]),
+]
+
+TEST_SEQUENCES = [
+    (["a", "ab"], ["a", "ab", "c"], [("insert", 2, 2)]),
+    (["a", "ab"], ["a", "c"], [("replace", 1, 1)]),
+]
+
+TEST_UNICODE = [
+    # In these cases, one of the words has a composed form, the other one does not.
+    ("Schlyñ", "Schlym̃", [("replace", 5, 5)]),
+    ("oͤde", "öde", [("replace", 0, 0)]),
+    # equal
+    (
+        unicodedata.lookup("LATIN SMALL LETTER N")
+        + unicodedata.lookup("COMBINING TILDE"),
+        unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE"),
+        [],
+    ),
+]
 
 
-def test_trivial():
-    assert seq_editops("abc", "abc") == []
-    assert seq_editops("", "") == []
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_editops_strings(s1, s2, expected_ops):
+    ops = editops(s1, s2)
+    assert ops == expected_ops
 
 
-def test_insert():
-    assert seq_editops("bc", "abc") == [("insert", 0, 0)]
-    assert seq_editops("ac", "abc") == [("insert", 1, 1)]
-    assert seq_editops("ab", "abc") == [("insert", 2, 2)]
-    assert seq_editops("", "a") == [("insert", 0, 0)]
+@pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
+def test_editops_sequences(s1, s2, expected_ops):
+    ops = editops(s1, s2)
+    assert ops == expected_ops
 
 
-def test_multiple():
-    assert seq_editops("bcd", "abce") == [("insert", 0, 0), ("replace", 2, 3)]
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_editops_fast(s1, s2, expected_ops):
+    ops = editops_fast(s1, s2)
+    assert ops == expected_ops
 
 
-def test_delete():
-    assert seq_editops("abcdef", "cdef") == [("delete", 0, 0), ("delete", 1, 0)]
-    assert seq_editops("Xabcdef", "Xcdef") == [("delete", 1, 1), ("delete", 2, 1)]
-    assert seq_editops("abcdefg", "acdefX") == [("delete", 1, 1), ("replace", 6, 5)]
-    assert seq_editops("abcde", "aabcd") == [("insert", 1, 1), ("delete", 4, 5)]
-    assert seq_editops("Foo", "") == [
-        ("delete", 0, 0),
-        ("delete", 1, 0),
-        ("delete", 2, 0),
-    ]
-    assert seq_editops("Foolish", "Foo") == [
-        ("delete", 3, 3),
-        ("delete", 4, 3),
-        ("delete", 5, 3),
-        ("delete", 6, 3),
-    ]
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_editops_fast_unicode(s1, s2, expected_ops):
+    ops = editops_fast(s1, s2)
+    assert ops != expected_ops
 
 
-def test_ambiguous():
-    assert seq_editops("bcd", "abcef") == [
-        ("insert", 0, 0),
-        ("replace", 2, 3),
-        ("insert", 3, 4),
-    ]
-
-
-def test_editops():
+@pytest.mark.parametrize(TEST_PARAMS, TEST_UNICODE)
+def test_editops_unicode(s1, s2, expected_ops):
     """Test editops() in cases where dealing with grapheme clusters matters"""
 
-    # In these cases, one of the words has a composed form, the other one does not.
-    assert editops("Schlyñ", "Schlym̃") == [("replace", 5, 5)]
-    assert editops("oͤde", "öde") == [("replace", 0, 0)]
-
-
-def test_editops_canonically_equivalent():
-    left = unicodedata.lookup("LATIN SMALL LETTER N") + unicodedata.lookup(
-        "COMBINING TILDE"
-    )
-    right = unicodedata.lookup("LATIN SMALL LETTER N WITH TILDE")
-    assert left != right
-    assert unicodedata.normalize("NFC", left) == unicodedata.normalize("NFC", right)
-    assert editops(left, right) == []
+    if not expected_ops:
+        assert s1 != s2
+        assert unicodedata.normalize("NFC", s1) == unicodedata.normalize("NFC", s2)
+    ops = editops(s1, s2)
+    assert ops == expected_ops
@@ -6,7 +6,7 @@ from multimethod import multimethod
 
 import uniseg.wordbreak
 
-from .edit_distance import levenshtein
+from .edit_distance import distance
 from . import ExtractedText
 
 
@@ -81,7 +81,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, i
     reference_seq = list(reference)
     compared_seq = list(compared)
 
-    d = levenshtein(reference_seq, compared_seq)
+    d = distance(reference_seq, compared_seq)
     n = len(reference_seq)
 
     if d == 0:
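For context, the word error rate is now computed from the swapped-in distance() over word sequences; since the arguments are lists, the multimethod dispatches to the own sequence implementation. A hedged sketch (simplified; only the lines shown in the hunk above are verbatim):

    from qurator.dinglehopper.edit_distance import distance  # import path assumed


    def word_error_rate_sketch(reference_seq, compared_seq):
        # Word-level edit distance, normalized by the reference length.
        d = distance(reference_seq, compared_seq)
        n = len(reference_seq)
        if d == 0:
            return 0
        return d / n  # assumption: the real function also handles n == 0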