diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index 0b9c8f4..e6724f5 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -2,7 +2,7 @@ from __future__ import division, print_function
 
 import unicodedata
 from functools import partial, lru_cache
-from typing import Sequence, Tuple
+from typing import Sequence, Tuple, List
 
 import numpy as np
 from multimethod import multimethod
@@ -84,7 +84,7 @@ def distance(s1: str, s2: str):
     """
     seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
     seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    return levenshtein(seq1, seq2)
+    return distance(seq1, seq2)
 
 
 @multimethod
@@ -92,6 +92,11 @@ def distance(s1: ExtractedText, s2: ExtractedText):
     return distance(s1.text, s2.text)
 
 
+@multimethod
+def distance(s1: List, s2: List):
+    return levenshtein(s1, s2)
+
+
 def seq_editops(seq1, seq2):
     """
     Return sequence of edit operations transforming one sequence to another.
diff --git a/qurator/dinglehopper/tests/test_edit_distance.py b/qurator/dinglehopper/tests/test_edit_distance.py
index dc1f202..fb50061 100644
--- a/qurator/dinglehopper/tests/test_edit_distance.py
+++ b/qurator/dinglehopper/tests/test_edit_distance.py
@@ -2,33 +2,42 @@ from __future__ import division, print_function
 
 import unicodedata
 
+import pytest
+
 from .. import levenshtein, distance
 
 
-def test_levenshtein():
-    assert levenshtein("a", "a") == 0
-    assert levenshtein("a", "b") == 1
-    assert levenshtein("Foo", "Bar") == 3
+TEST_PARAMS = "seq1,seq2,expected_dist"
+
+TEST_STRINGS = [
+    ("a", "a", 0),
+    ("a", "b", 1),
+    ("Foo", "Bar", 3),
+    ("", "", 0),
+    ("Foo", "", 3),
+    ("", "Foo", 3),
+    ("Foo", "Food", 1),
+    ("Fnord", "Food", 2),
+    ("Müll", "Mull", 1),
+    ("Abstand", "Sand", 4),
+]
 
-    assert levenshtein("", "") == 0
-    assert levenshtein("Foo", "") == 3
-    assert levenshtein("", "Foo") == 3
+TEST_SEQUENCES = [(["a", "ab"], ["a", "ab", "c"], 1), (["a", "ab"], ["a", "c"], 1)]
 
-    assert levenshtein("Foo", "Food") == 1
-    assert levenshtein("Fnord", "Food") == 2
-    assert levenshtein("Müll", "Mull") == 1
-    assert levenshtein("Abstand", "Sand") == 4
 
+@pytest.mark.parametrize(TEST_PARAMS, [*TEST_STRINGS, *TEST_SEQUENCES])
+def test_distance_sequences(seq1, seq2, expected_dist):
+    dist = distance(seq1, seq2)
+    assert dist == expected_dist
 
-def test_levenshtein_other_sequences():
-    assert levenshtein(["a", "ab"], ["a", "ab", "c"]) == 1
-    assert levenshtein(["a", "ab"], ["a", "c"]) == 1
 
+@pytest.mark.parametrize(TEST_PARAMS, TEST_STRINGS)
+def test_distance(seq1, seq2, expected_dist):
+    dist = distance(seq1, seq2)
+    assert dist == expected_dist
 
-def test_distance():
-    assert distance("Fnord", "Food") == 2
-    assert distance("Müll", "Mull") == 1
 
+def test_distance_unicode_wide():
     word1 = unicodedata.normalize("NFC", "Schlyñ")
     word2 = unicodedata.normalize("NFD", "Schlyñ")  # Different, decomposed!
     assert distance(word1, word2) == 0
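
A small usage sketch of the dispatch behaviour introduced above: it assumes the package exports used in the test file (`from .. import levenshtein, distance`, i.e. `qurator.dinglehopper`), and the expected values mirror the test cases in the diff.

```python
from qurator.dinglehopper import distance, levenshtein

# String arguments are NFC-normalized and split into grapheme clusters,
# then forwarded to the new List overload, which delegates to levenshtein().
assert distance("Fnord", "Food") == 2
assert distance("Müll", "Mull") == 1

# Arbitrary sequences (e.g. lists of tokens) dispatch directly to the
# List overload; levenshtein() remains available for raw sequences.
assert distance(["a", "ab"], ["a", "ab", "c"]) == 1
assert levenshtein(["a", "ab"], ["a", "c"]) == 1
```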