You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
42 lines
1.4 KiB
Python
42 lines
1.4 KiB
Python
from __future__ import division, print_function
|
|
|
|
import math
|
|
import unicodedata
|
|
|
|
from .. import character_error_rate
|
|
|
|
|
|
def test_character_error_rate():
    """Exercise character_error_rate on short string pairs.

    Every row is (first argument, second argument, expected result). A row
    whose expected result is None must yield an infinite rate rather than a
    specific number.

    NOTE(review): the expected denominators match the length of the first
    argument, which suggests it is treated as the reference text -- confirm
    against the implementation.
    """
    cases = [
        ("a", "a", 0),
        ("a", "b", 1 / 1),
        ("Foo", "Bar", 3 / 3),
        ("Foo", "", 3 / 3),
        ("", "", 0),
        ("", "Foo", None),  # empty first argument, non-empty second: infinite
        ("Foo", "Food", 1 / 3),
        ("Fnord", "Food", 2 / 5),
        ("Müll", "Mull", 1 / 4),
        ("Abstand", "Sand", 4 / 7),
    ]

    for first, second, expected in cases:
        rate = character_error_rate(first, second)
        if expected is None:
            assert math.isinf(rate)
        else:
            assert rate == expected
|
|
|
|
|
|
def test_character_error_rate_hard():
    """Exercise character_error_rate on inputs containing combining characters.

    The first check compares an NFC-normalized string with an NFD-normalized
    one that differs only in its final punctuation mark; the expected result
    of 1 / 19 means the differing normalization form alone must not count as
    an error.

    The second check uses two six-letter words that differ in their last
    letter, one stored as 6 code points and the other as 7, and expects the
    same rate regardless of argument order.
    """
    composed = unicodedata.normalize("NFC", "Schlyñ lorem ipsum.")
    decomposed = unicodedata.normalize("NFD", "Schlyñ lorem ipsum!")  # Different, decomposed!
    assert character_error_rate(composed, decomposed) == 1 / 19

    # This ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points
    with_n_tilde = "Schlyñ"
    assert len(with_n_tilde) == 6

    # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
    with_m_tilde = "Schlym̃"
    assert len(with_m_tilde) == 7

    # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
    for first, second in ((with_m_tilde, with_n_tilde), (with_n_tilde, with_m_tilde)):
        assert character_error_rate(first, second) == 1 / 6
|