1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-06-10 04:09:59 +02:00

adapt tests

This commit is contained in:
Robert Sachunsky 2025-03-20 19:35:19 +01:00 committed by Mike Gerber
parent 0583d8c0f0
commit 7a79bae6fd
8 changed files with 14 additions and 14 deletions

View file

@@ -14,9 +14,9 @@ def test_character_error_rate():
 assert character_error_rate("Foo", "") == 3 / 3
 assert character_error_rate("", "") == 0
-assert math.isinf(character_error_rate("", "Foo"))
+assert character_error_rate("", "Foo") == 3 / 3
-assert character_error_rate("Foo", "Food") == 1 / 3
+assert character_error_rate("Foo", "Food") == 1 / 4
 assert character_error_rate("Fnord", "Food") == 2 / 5
 assert character_error_rate("Müll", "Mull") == 1 / 4
 assert character_error_rate("Abstand", "Sand") == 4 / 7

View file

@@ -6,8 +6,8 @@ from .. import distance
 def test_distance():
-assert distance("Fnord", "Food") == 2
-assert distance("Müll", "Mull") == 1
+assert distance("Fnord", "Food") == 2 / 5
+assert distance("Müll", "Mull") == 1 / 4
 word1 = unicodedata.normalize("NFC", "Schlyñ")
 word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed!
@@ -21,4 +21,4 @@ def test_distance():
 assert (
 len(word2) == 7
 ) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
-assert distance(word1, word2) == 1
+assert distance(word1, word2) == 1 / 6

View file

@@ -56,4 +56,4 @@ def test_character_error_rate_between_page_alto_2():
 )
 )
-assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified
+assert character_error_rate(gt, ocr) == 8 / 594 # Manually verified

View file

@@ -32,11 +32,11 @@ def test_cli_json_cer_is_infinity(tmp_path):
 with working_directory(tmp_path):
 with open("gt.txt", "w") as gtf:
-gtf.write("") # Empty to yield CER == inf
+gtf.write("")
 with open("ocr.txt", "w") as ocrf:
 ocrf.write("Not important")
 process("gt.txt", "ocr.txt", "report")
 with open("report.json", "r") as jsonf:
 j = json.load(jsonf)
-assert j["cer"] == pytest.approx(float("inf"))
+assert j["cer"] == pytest.approx(1.0)

View file

@@ -17,7 +17,7 @@ def test_distance_between_page_files():
 # → 2 differences
 gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
 ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
-assert distance(gt, ocr) == 2
+assert distance(gt, ocr) == 2 / 827
 @pytest.mark.integration
@@ -52,4 +52,4 @@ def test_distance_between_page_alto_2():
 )
 )
-assert distance(gt, ocr) == 8 # Manually verified
+assert distance(gt, ocr) == 8 / 594 # Manually verified

View file

@@ -12,9 +12,9 @@ from .util import working_directory
 @pytest.mark.parametrize(
 "gt_file_content,ocr_file_content,cer_expected",
 [
-("", "Lorem ipsum", math.inf),
+("", "Lorem ipsum", 1.0),
 ("Lorem ipsum", "", 1.0),
-("\ufeff", "Lorem ipsum", math.inf),
+("\ufeff", "Lorem ipsum", 1.0),
 ("Lorem ipsum", "\ufeff", 1.0),
 ("", "", 0.0),
 ("\ufeff", "", 0.0),

View file

@@ -64,5 +64,5 @@ def test_word_error_rate_between_page_alto_2():
 )
 assert (
-word_error_rate(gt, ocr) == 7 / gt_word_count
+word_error_rate(gt, ocr) == 7 / (gt_word_count + 1)
 ) # Manually verified, 6 words are wrong, 1 got split (=2 errors)

View file

@@ -76,7 +76,7 @@ def test_word_error_rate():
 )
 assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
-assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!"))
+assert word_error_rate("", "Dies ist ein Beispielsatz!") == 4 / 4
 assert word_error_rate("", "") == 0
 assert (