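# NOTE: header residue from the repository web page this file was copied from.
# Page notice: You cannot select more than 25 topics. Topics must start with a
# letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# File: dinglehopper/dinglehopper/tests/test_align.py
# Listed as: 184 lines, 4.7 KiB, Python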

import pytest
from .util import unzip
from .. import align, seq_align, distance
def test_left_empty():
    """Aligning an empty left string pairs every right character with None."""
    pairs = list(align("", "foo"))
    assert pairs == [(None, char) for char in "foo"]
def test_right_empty():
    """Aligning against an empty right string pairs every left character with None."""
    pairs = list(align("foo", ""))
    assert pairs == [(char, None) for char in "foo"]
def test_left_longer():
    """A surplus trailing character on the left is paired with None."""
    pairs = list(align("food", "foo"))

    # The common prefix "foo" aligns character by character.
    common = [(char, char) for char in "foo"]
    assert pairs == common + [("d", None)]
def test_right_longer():
    """A surplus trailing character on the right is paired with None."""
    pairs = list(align("foo", "food"))

    # The common prefix "foo" aligns character by character.
    common = [(char, char) for char in "foo"]
    assert pairs == common + [(None, "d")]
def test_some_diff():
    """Check both sides of an alignment of two partly differing strings.

    The left side is expected to be all of "abcde" followed by a None gap; the
    right side is expected to be "aaadef" without any gap.
    """
    pairs = list(align("abcde", "aaadef"))
    left_side, right_side = unzip(pairs)

    assert list(left_side) == list("abcde") + [None]
    assert list(right_side) == list("aaadef")
def test_longer():
    """Align two short sentences that differ by one extra letter on each side.

    The expected alignment has a gap on the right for the extra "e" of "eine",
    a gap on the left for the "e" missing from "Tst", and pairs the final
    "!" with ".".
    """
    s1 = "Dies ist eine Tst!"
    s2 = "Dies ist ein Test."

    pairs = list(align(s1, s2))

    expected = (
        [(char, char) for char in "Dies ist ein"]  # identical prefix
        + [("e", None)]  # extra "e" only present on the left
        + [(" ", " "), ("T", "T")]
        + [(None, "e")]  # "e" only present on the right
        + [("s", "s"), ("t", "t")]
        + [("!", ".")]  # differing final punctuation is paired up
    )
    assert pairs == expected
def test_completely_different():
    """Two 5-character strings without common characters align to 5 pairs."""
    pairs = list(align("abcde", "fghij"))
    assert len(pairs) == 5
def test_with_some_fake_ocr_errors():
    """Align a clean sentence against a noisy variant containing junk text.

    Only the two ends of the alignment are checked: the junk at the start of
    the right string must face None on the left, and the last pair must be
    ("ß", "b").
    """
    clean = "Über die vielen Sorgen wegen desselben vergaß"
    noisy = "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab"

    # Junk the noisy string starts with; its length (18) is how many leading
    # pairs are inspected.
    leading_junk = "SomeJunk MoreJunk "
    junk_len = len(leading_junk)

    pairs = list(align(clean, noisy))
    left_side, right_side = unzip(pairs)

    # Beginning
    assert list(left_side[:junk_len]) == [None] * junk_len
    assert list(right_side[:junk_len]) == list(leading_junk)

    # End
    assert list(left_side[-1:]) == ["ß"]
    assert list(right_side[-1:]) == ["b"]
def test_lines():
    """Test aligning two lists of lines with seq_align.

    This mainly serves as documentation for comparing lists of lines: the
    right list contains one extra line, which is expected to face None on the
    left.
    """
    left_lines = ["This is a line.", "This is another", "And the last line"]
    right_lines = [
        "This is a line.",
        "This is another",
        "J u n k",
        "And the last line",
    ]

    # Build the expectations up front, as independent lists.
    expected_left = left_lines[:2] + [None] + left_lines[2:]
    expected_right = list(right_lines)

    pairs = list(seq_align(left_lines, right_lines))
    left_side, right_side = unzip(pairs)

    assert list(left_side) == expected_left
    assert list(right_side) == expected_right
# FIXME
# This was based on our own implementation that used __eq__ and not __hash__ as
# rapidfuzz does. Need to review this.
@pytest.mark.skip(reason="This fails with rapidfuzz <2.6 and is flawed anyway")
def test_lines_similar():
    """
    Test comparing list of lines while using a "weaker equivalence".

    This mainly serves as documentation.
    """

    class SimilarString:
        """Wrap a string so that ``==`` means "nearly the same text".

        NOTE(review): ``__hash__`` hashes the exact text while ``__eq__`` is
        fuzzy, so two instances may compare equal and still hash differently.
        This is the flaw the FIXME above refers to; it is left as is here.
        """

        def __init__(self, string):
            self._string = string

        def __eq__(self, other):
            # Just an example!
            if not isinstance(other, SimilarString):
                # Let Python fall back to its default comparison (e.g. against
                # the None gaps in an alignment) instead of failing with an
                # AttributeError on ``other._string``.
                return NotImplemented
            min_len = min(len(self._string), len(other._string))
            if min_len == 0:
                # An empty string is never considered similar to anything.
                return False
            normalized_distance = distance(self._string, other._string) / min_len
            return normalized_distance < 0.1

        # No explicit __ne__: Python 3 derives it from __eq__ and handles the
        # NotImplemented case correctly.

        def __repr__(self):
            return "SimilarString('%s')" % self._string

        def __hash__(self):
            return hash(self._string)

    result = list(
        seq_align(
            [
                SimilarString("This is a line."),
                SimilarString("This is another"),
                SimilarString("And the last line"),
            ],
            [
                SimilarString("This is a ljne."),
                SimilarString("This is another"),
                SimilarString("J u n k"),
                SimilarString("And the last line"),
            ],
        )
    )
    left, right = unzip(result)

    assert list(left) == [
        SimilarString("This is a line."),
        SimilarString("This is another"),
        None,
        SimilarString("And the last line"),
    ]
    assert list(right) == [
        SimilarString("This is a ljne."),
        SimilarString("This is another"),
        SimilarString("J u n k"),
        SimilarString("And the last line"),
    ]

    # Test __eq__ (i.e. is it a substitution or a similar string?)
    assert list(left)[0] == list(right)[0]