# dinglehopper/qurator/dinglehopper/tests/test_character_error_rate.py

from __future__ import division, print_function

import math
import unicodedata

from .. import character_error_rate


def test_character_error_rate():
    """Check character_error_rate() against hand-computed values for short strings.

    Non-zero expected values are written as fractions whose denominator is the
    length of the first argument, so each case reads as "errors / length".
    """
    # Identical strings, full substitutions, a full deletion and the empty/empty case.
    basic_cases = (
        ('a', 'a', 0),
        ('a', 'b', 1/1),
        ('Foo', 'Bar', 3/3),
        ('Foo', '', 3/3),
        ('', '', 0),
    )
    for first, second, expected in basic_cases:
        assert character_error_rate(first, second) == expected

    # An empty first argument against a non-empty second one must yield an
    # infinite rate rather than a finite number.
    rate_for_empty_first = character_error_rate('', 'Foo')
    assert math.isinf(rate_for_empty_first)

    # Insertions, deletions and substitutions mixed, including a non-ASCII letter.
    mixed_cases = (
        ('Foo', 'Food', 1/3),
        ('Fnord', 'Food', 2/5),
        ('Müll', 'Mull', 1/4),
        ('Abstand', 'Sand', 4/7),
    )
    for first, second, expected in mixed_cases:
        assert character_error_rate(first, second) == expected


def test_character_error_rate_hard():
    """Check character_error_rate() on inputs where code points and characters diverge."""
    # Same text in two different normalization forms; only the final punctuation
    # mark differs, so exactly one error out of 19 characters is expected.
    composed_text = unicodedata.normalize('NFC', 'Schlyñ lorem ipsum.')
    decomposed_text = unicodedata.normalize('NFD', 'Schlyñ lorem ipsum!')  # Different, decomposed!
    assert character_error_rate(composed_text, decomposed_text) == 1/19

    # Ends with LATIN SMALL LETTER N WITH TILDE, so 6 code points.
    ends_with_n_tilde = 'Schlyñ'
    assert len(ends_with_n_tilde) == 6
    # Ends with LATIN SMALL LETTER M + COMBINING TILDE, so 7 code points.
    ends_with_m_tilde = 'Schlym̃'
    assert len(ends_with_m_tilde) == 7

    # Both strings have the same length in terms of grapheme clusters, so the
    # CER should be symmetrical: the same value whichever string comes first.
    argument_orders = (
        (ends_with_m_tilde, ends_with_n_tilde),
        (ends_with_n_tilde, ends_with_m_tilde),
    )
    for first, second in argument_orders:
        assert character_error_rate(first, second) == 1/6