1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-07-13 04:19:59 +02:00
dinglehopper/qurator/dinglehopper/tests/test_character_error_rate.py
Gerber, Mike f94e8b9b1c Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector"
This reverts commit a3c1eee8f31349edcfb1e36920763bcecceb1129, reversing
changes made to dc76213ffc1fbabc2c45f0e52ced55449bdf2e83.
2019-12-09 12:44:05 +01:00

37 lines
1.3 KiB
Python

from __future__ import division, print_function
import math
import unicodedata
from .. import character_error_rate
def test_character_error_rate():
    """Check character_error_rate() against a table of simple string pairs.

    Each expected value is written as a fraction whose denominator is the
    length of the first argument, e.g. ('Foo', 'Food') -> 1/3.
    """
    # ((first string, second string), expected rate)
    finite_cases = [
        (('a', 'a'), 0),
        (('a', 'b'), 1/1),
        (('Foo', 'Bar'), 3/3),
        (('Foo', ''), 3/3),
        (('', ''), 0),
        (('Foo', 'Food'), 1/3),
        (('Fnord', 'Food'), 2/5),
        (('Müll', 'Mull'), 1/4),
        (('Abstand', 'Sand'), 4/7),
    ]
    for (first, second), expected in finite_cases:
        assert character_error_rate(first, second) == expected

    # An empty first string compared against a non-empty one has no finite
    # rate; the function is expected to return infinity here.
    rate_for_empty_first = character_error_rate('', 'Foo')
    assert math.isinf(rate_for_empty_first)
def test_character_error_rate_hard():
    """Check character_error_rate() on strings with differing Unicode forms.

    Covers an NFC string compared against an NFD string, and a precomposed
    character compared against a base letter plus combining mark.
    """
    composed = unicodedata.normalize('NFC', 'Schlyñ lorem ipsum.')
    # Different text (final '!' instead of '.'), and decomposed!
    decomposed = unicodedata.normalize('NFD', 'Schlyñ lorem ipsum!')
    assert character_error_rate(composed, decomposed) == 1/19

    # Ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points.
    with_precomposed_n = 'Schlyñ'
    assert len(with_precomposed_n) == 6

    # Ends with LATIN SMALL LETTER M + COMBINING TILDE, so 7 code points.
    with_combining_m = 'Schlym̃'
    assert len(with_combining_m) == 7

    # Both strings have the same length in terms of grapheme clusters, so the
    # CER should be symmetrical: check both argument orders.
    both_orders = (
        (with_combining_m, with_precomposed_n),
        (with_precomposed_n, with_combining_m),
    )
    for first, second in both_orders:
        assert character_error_rate(first, second) == 1/6