mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-10 04:09:59 +02:00
adapt tests
This commit is contained in:
parent
0583d8c0f0
commit
7a79bae6fd
8 changed files with 14 additions and 14 deletions
|
@ -14,9 +14,9 @@ def test_character_error_rate():
|
|||
assert character_error_rate("Foo", "") == 3 / 3
|
||||
|
||||
assert character_error_rate("", "") == 0
|
||||
assert math.isinf(character_error_rate("", "Foo"))
|
||||
assert character_error_rate("", "Foo") == 3 / 3
|
||||
|
||||
assert character_error_rate("Foo", "Food") == 1 / 3
|
||||
assert character_error_rate("Foo", "Food") == 1 / 4
|
||||
assert character_error_rate("Fnord", "Food") == 2 / 5
|
||||
assert character_error_rate("Müll", "Mull") == 1 / 4
|
||||
assert character_error_rate("Abstand", "Sand") == 4 / 7
|
||||
|
|
|
@ -6,8 +6,8 @@ from .. import distance
|
|||
|
||||
|
||||
def test_distance():
|
||||
assert distance("Fnord", "Food") == 2
|
||||
assert distance("Müll", "Mull") == 1
|
||||
assert distance("Fnord", "Food") == 2 / 5
|
||||
assert distance("Müll", "Mull") == 1 / 4
|
||||
|
||||
word1 = unicodedata.normalize("NFC", "Schlyñ")
|
||||
word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed!
|
||||
|
@ -21,4 +21,4 @@ def test_distance():
|
|||
assert (
|
||||
len(word2) == 7
|
||||
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
|
||||
assert distance(word1, word2) == 1
|
||||
assert distance(word1, word2) == 1 / 6
|
||||
|
|
|
@ -56,4 +56,4 @@ def test_character_error_rate_between_page_alto_2():
|
|||
)
|
||||
)
|
||||
|
||||
assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified
|
||||
assert character_error_rate(gt, ocr) == 8 / 594 # Manually verified
|
||||
|
|
|
@ -32,11 +32,11 @@ def test_cli_json_cer_is_infinity(tmp_path):
|
|||
|
||||
with working_directory(tmp_path):
|
||||
with open("gt.txt", "w") as gtf:
|
||||
gtf.write("") # Empty to yield CER == inf
|
||||
gtf.write("")
|
||||
with open("ocr.txt", "w") as ocrf:
|
||||
ocrf.write("Not important")
|
||||
|
||||
process("gt.txt", "ocr.txt", "report")
|
||||
with open("report.json", "r") as jsonf:
|
||||
j = json.load(jsonf)
|
||||
assert j["cer"] == pytest.approx(float("inf"))
|
||||
assert j["cer"] == pytest.approx(1.0)
|
||||
|
|
|
@ -17,7 +17,7 @@ def test_distance_between_page_files():
|
|||
# → 2 differences
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
|
||||
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
|
||||
assert distance(gt, ocr) == 2
|
||||
assert distance(gt, ocr) == 2 / 827
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
|
@ -52,4 +52,4 @@ def test_distance_between_page_alto_2():
|
|||
)
|
||||
)
|
||||
|
||||
assert distance(gt, ocr) == 8 # Manually verified
|
||||
assert distance(gt, ocr) == 8 / 594 # Manually verified
|
||||
|
|
|
@ -12,9 +12,9 @@ from .util import working_directory
|
|||
@pytest.mark.parametrize(
|
||||
"gt_file_content,ocr_file_content,cer_expected",
|
||||
[
|
||||
("", "Lorem ipsum", math.inf),
|
||||
("", "Lorem ipsum", 1.0),
|
||||
("Lorem ipsum", "", 1.0),
|
||||
("\ufeff", "Lorem ipsum", math.inf),
|
||||
("\ufeff", "Lorem ipsum", 1.0),
|
||||
("Lorem ipsum", "\ufeff", 1.0),
|
||||
("", "", 0.0),
|
||||
("\ufeff", "", 0.0),
|
||||
|
|
|
@ -64,5 +64,5 @@ def test_word_error_rate_between_page_alto_2():
|
|||
)
|
||||
|
||||
assert (
|
||||
word_error_rate(gt, ocr) == 7 / gt_word_count
|
||||
word_error_rate(gt, ocr) == 7 / (gt_word_count + 1)
|
||||
) # Manually verified, 6 words are wrong, 1 got split (=2 errors)
|
||||
|
|
|
@ -76,7 +76,7 @@ def test_word_error_rate():
|
|||
)
|
||||
|
||||
assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
|
||||
assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!"))
|
||||
assert word_error_rate("", "Dies ist ein Beispielsatz!") == 4 / 4
|
||||
assert word_error_rate("", "") == 0
|
||||
|
||||
assert (
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue