diff --git a/src/dinglehopper/tests/test_character_error_rate.py b/src/dinglehopper/tests/test_character_error_rate.py index 970f740..63d2f72 100644 --- a/src/dinglehopper/tests/test_character_error_rate.py +++ b/src/dinglehopper/tests/test_character_error_rate.py @@ -14,9 +14,9 @@ def test_character_error_rate(): assert character_error_rate("Foo", "") == 3 / 3 assert character_error_rate("", "") == 0 - assert math.isinf(character_error_rate("", "Foo")) + assert character_error_rate("", "Foo") == 3 / 3 - assert character_error_rate("Foo", "Food") == 1 / 3 + assert character_error_rate("Foo", "Food") == 1 / 4 assert character_error_rate("Fnord", "Food") == 2 / 5 assert character_error_rate("Müll", "Mull") == 1 / 4 assert character_error_rate("Abstand", "Sand") == 4 / 7 diff --git a/src/dinglehopper/tests/test_edit_distance.py b/src/dinglehopper/tests/test_edit_distance.py index be427a8..e782ca6 100644 --- a/src/dinglehopper/tests/test_edit_distance.py +++ b/src/dinglehopper/tests/test_edit_distance.py @@ -6,8 +6,8 @@ from .. import distance def test_distance(): - assert distance("Fnord", "Food") == 2 - assert distance("Müll", "Mull") == 1 + assert distance("Fnord", "Food") == 2 / 5 + assert distance("Müll", "Mull") == 1 / 4 word1 = unicodedata.normalize("NFC", "Schlyñ") word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed! @@ -21,4 +21,4 @@ def test_distance(): assert ( len(word2) == 7 ) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points - assert distance(word1, word2) == 1 + assert distance(word1, word2) == 1 / 6 diff --git a/src/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/src/dinglehopper/tests/test_integ_character_error_rate_ocr.py index 7755e2d..b3a5914 100644 --- a/src/dinglehopper/tests/test_integ_character_error_rate_ocr.py +++ b/src/dinglehopper/tests/test_integ_character_error_rate_ocr.py @@ -56,4 +56,4 @@ def test_character_error_rate_between_page_alto_2(): ) ) - assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified + assert character_error_rate(gt, ocr) == 8 / 594 # Manually verified diff --git a/src/dinglehopper/tests/test_integ_cli_valid_json.py b/src/dinglehopper/tests/test_integ_cli_valid_json.py index 6cbfa0c..a993ee7 100644 --- a/src/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/src/dinglehopper/tests/test_integ_cli_valid_json.py @@ -32,11 +32,11 @@ def test_cli_json_cer_is_infinity(tmp_path): with working_directory(tmp_path): with open("gt.txt", "w") as gtf: - gtf.write("") # Empty to yield CER == inf + gtf.write("") with open("ocr.txt", "w") as ocrf: ocrf.write("Not important") process("gt.txt", "ocr.txt", "report") with open("report.json", "r") as jsonf: j = json.load(jsonf) - assert j["cer"] == pytest.approx(float("inf")) + assert j["cer"] == pytest.approx(1.0) diff --git a/src/dinglehopper/tests/test_integ_edit_distance_ocr.py b/src/dinglehopper/tests/test_integ_edit_distance_ocr.py index e01ac76..b032e22 100644 --- a/src/dinglehopper/tests/test_integ_edit_distance_ocr.py +++ b/src/dinglehopper/tests/test_integ_edit_distance_ocr.py @@ -17,7 +17,7 @@ def test_distance_between_page_files(): # → 2 differences gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml"))) - assert distance(gt, ocr) == 2 + assert distance(gt, ocr) == 2 / 827 @pytest.mark.integration @@ -52,4 +52,4 @@ def test_distance_between_page_alto_2(): ) ) - assert distance(gt, ocr) == 8 # Manually verified + assert distance(gt, ocr) == 8 / 594 # Manually verified diff --git a/src/dinglehopper/tests/test_integ_empty_files.py b/src/dinglehopper/tests/test_integ_empty_files.py index 5c90ed1..2b29513 100644 --- a/src/dinglehopper/tests/test_integ_empty_files.py +++ b/src/dinglehopper/tests/test_integ_empty_files.py @@ -12,9 +12,9 @@ from .util import working_directory @pytest.mark.parametrize( "gt_file_content,ocr_file_content,cer_expected", [ - ("", "Lorem ipsum", math.inf), + ("", "Lorem ipsum", 1.0), ("Lorem ipsum", "", 1.0), - ("\ufeff", "Lorem ipsum", math.inf), + ("\ufeff", "Lorem ipsum", 1.0), ("Lorem ipsum", "\ufeff", 1.0), ("", "", 0.0), ("\ufeff", "", 0.0), diff --git a/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py index 8a57ed2..f114685 100644 --- a/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py +++ b/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py @@ -64,5 +64,5 @@ def test_word_error_rate_between_page_alto_2(): ) assert ( - word_error_rate(gt, ocr) == 7 / gt_word_count + word_error_rate(gt, ocr) == 7 / (gt_word_count + 1) ) # Manually verified, 6 words are wrong, 1 got split (=2 errors) diff --git a/src/dinglehopper/tests/test_word_error_rate.py b/src/dinglehopper/tests/test_word_error_rate.py index 311ffff..245fa74 100644 --- a/src/dinglehopper/tests/test_word_error_rate.py +++ b/src/dinglehopper/tests/test_word_error_rate.py @@ -76,7 +76,7 @@ def test_word_error_rate(): ) assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4 - assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!")) + assert word_error_rate("", "Dies ist ein Beispielsatz!") == 4 / 4 assert word_error_rate("", "") == 0 assert (