adapt tests

Robert Sachunsky 1 month ago
parent ca5de5729d
commit a33b713f36

@ -14,9 +14,9 @@ def test_character_error_rate():
assert character_error_rate("Foo", "") == 3 / 3 assert character_error_rate("Foo", "") == 3 / 3
assert character_error_rate("", "") == 0 assert character_error_rate("", "") == 0
assert math.isinf(character_error_rate("", "Foo")) assert character_error_rate("", "Foo") == 3 / 3
assert character_error_rate("Foo", "Food") == 1 / 3 assert character_error_rate("Foo", "Food") == 1 / 4
assert character_error_rate("Fnord", "Food") == 2 / 5 assert character_error_rate("Fnord", "Food") == 2 / 5
assert character_error_rate("Müll", "Mull") == 1 / 4 assert character_error_rate("Müll", "Mull") == 1 / 4
assert character_error_rate("Abstand", "Sand") == 4 / 7 assert character_error_rate("Abstand", "Sand") == 4 / 7

@ -6,8 +6,8 @@ from .. import distance
def test_distance(): def test_distance():
assert distance("Fnord", "Food") == 2 assert distance("Fnord", "Food") == 2 / 5
assert distance("Müll", "Mull") == 1 assert distance("Müll", "Mull") == 1 / 4
word1 = unicodedata.normalize("NFC", "Schlyñ") word1 = unicodedata.normalize("NFC", "Schlyñ")
word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed! word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed!
@ -21,4 +21,4 @@ def test_distance():
assert ( assert (
len(word2) == 7 len(word2) == 7
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points ) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
assert distance(word1, word2) == 1 assert distance(word1, word2) == 1 / 6

@ -56,4 +56,4 @@ def test_character_error_rate_between_page_alto_2():
) )
) )
assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified assert character_error_rate(gt, ocr) == 8 / 594 # Manually verified

@ -32,11 +32,11 @@ def test_cli_json_cer_is_infinity(tmp_path):
with working_directory(tmp_path): with working_directory(tmp_path):
with open("gt.txt", "w") as gtf: with open("gt.txt", "w") as gtf:
gtf.write("") # Empty to yield CER == inf gtf.write("")
with open("ocr.txt", "w") as ocrf: with open("ocr.txt", "w") as ocrf:
ocrf.write("Not important") ocrf.write("Not important")
process("gt.txt", "ocr.txt", "report") process("gt.txt", "ocr.txt", "report")
with open("report.json", "r") as jsonf: with open("report.json", "r") as jsonf:
j = json.load(jsonf) j = json.load(jsonf)
assert j["cer"] == pytest.approx(float("inf")) assert j["cer"] == pytest.approx(1.0)

@ -17,7 +17,7 @@ def test_distance_between_page_files():
# → 2 differences # → 2 differences
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml"))) ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
assert distance(gt, ocr) == 2 assert distance(gt, ocr) == 2 / 827
@pytest.mark.integration @pytest.mark.integration
@ -52,4 +52,4 @@ def test_distance_between_page_alto_2():
) )
) )
assert distance(gt, ocr) == 8 # Manually verified assert distance(gt, ocr) == 8 / 594 # Manually verified

@ -12,9 +12,9 @@ from .util import working_directory
@pytest.mark.parametrize( @pytest.mark.parametrize(
"gt_file_content,ocr_file_content,cer_expected", "gt_file_content,ocr_file_content,cer_expected",
[ [
("", "Lorem ipsum", math.inf), ("", "Lorem ipsum", 1.0),
("Lorem ipsum", "", 1.0), ("Lorem ipsum", "", 1.0),
("\ufeff", "Lorem ipsum", math.inf), ("\ufeff", "Lorem ipsum", 1.0),
("Lorem ipsum", "\ufeff", 1.0), ("Lorem ipsum", "\ufeff", 1.0),
("", "", 0.0), ("", "", 0.0),
("\ufeff", "", 0.0), ("\ufeff", "", 0.0),

@ -64,5 +64,5 @@ def test_word_error_rate_between_page_alto_2():
) )
assert ( assert (
word_error_rate(gt, ocr) == 7 / gt_word_count word_error_rate(gt, ocr) == 7 / (gt_word_count + 1)
) # Manually verified, 6 words are wrong, 1 got split (=2 errors) ) # Manually verified, 6 words are wrong, 1 got split (=2 errors)

@ -76,7 +76,7 @@ def test_word_error_rate():
) )
assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4 assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!")) assert word_error_rate("", "Dies ist ein Beispielsatz!") == 4 / 4
assert word_error_rate("", "") == 0 assert word_error_rate("", "") == 0
assert ( assert (

Loading…
Cancel
Save