mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-08 11:20:26 +02:00
adapt tests
This commit is contained in:
parent
ca5de5729d
commit
a33b713f36
8 changed files with 14 additions and 14 deletions
|
@ -14,9 +14,9 @@ def test_character_error_rate():
|
||||||
assert character_error_rate("Foo", "") == 3 / 3
|
assert character_error_rate("Foo", "") == 3 / 3
|
||||||
|
|
||||||
assert character_error_rate("", "") == 0
|
assert character_error_rate("", "") == 0
|
||||||
assert math.isinf(character_error_rate("", "Foo"))
|
assert character_error_rate("", "Foo") == 3 / 3
|
||||||
|
|
||||||
assert character_error_rate("Foo", "Food") == 1 / 3
|
assert character_error_rate("Foo", "Food") == 1 / 4
|
||||||
assert character_error_rate("Fnord", "Food") == 2 / 5
|
assert character_error_rate("Fnord", "Food") == 2 / 5
|
||||||
assert character_error_rate("Müll", "Mull") == 1 / 4
|
assert character_error_rate("Müll", "Mull") == 1 / 4
|
||||||
assert character_error_rate("Abstand", "Sand") == 4 / 7
|
assert character_error_rate("Abstand", "Sand") == 4 / 7
|
||||||
|
|
|
@ -6,8 +6,8 @@ from .. import distance
|
||||||
|
|
||||||
|
|
||||||
def test_distance():
|
def test_distance():
|
||||||
assert distance("Fnord", "Food") == 2
|
assert distance("Fnord", "Food") == 2 / 5
|
||||||
assert distance("Müll", "Mull") == 1
|
assert distance("Müll", "Mull") == 1 / 4
|
||||||
|
|
||||||
word1 = unicodedata.normalize("NFC", "Schlyñ")
|
word1 = unicodedata.normalize("NFC", "Schlyñ")
|
||||||
word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed!
|
word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed!
|
||||||
|
@ -21,4 +21,4 @@ def test_distance():
|
||||||
assert (
|
assert (
|
||||||
len(word2) == 7
|
len(word2) == 7
|
||||||
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
|
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
|
||||||
assert distance(word1, word2) == 1
|
assert distance(word1, word2) == 1 / 6
|
||||||
|
|
|
@ -56,4 +56,4 @@ def test_character_error_rate_between_page_alto_2():
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified
|
assert character_error_rate(gt, ocr) == 8 / 594 # Manually verified
|
||||||
|
|
|
@ -32,11 +32,11 @@ def test_cli_json_cer_is_infinity(tmp_path):
|
||||||
|
|
||||||
with working_directory(tmp_path):
|
with working_directory(tmp_path):
|
||||||
with open("gt.txt", "w") as gtf:
|
with open("gt.txt", "w") as gtf:
|
||||||
gtf.write("") # Empty to yield CER == inf
|
gtf.write("")
|
||||||
with open("ocr.txt", "w") as ocrf:
|
with open("ocr.txt", "w") as ocrf:
|
||||||
ocrf.write("Not important")
|
ocrf.write("Not important")
|
||||||
|
|
||||||
process("gt.txt", "ocr.txt", "report")
|
process("gt.txt", "ocr.txt", "report")
|
||||||
with open("report.json", "r") as jsonf:
|
with open("report.json", "r") as jsonf:
|
||||||
j = json.load(jsonf)
|
j = json.load(jsonf)
|
||||||
assert j["cer"] == pytest.approx(float("inf"))
|
assert j["cer"] == pytest.approx(1.0)
|
||||||
|
|
|
@ -17,7 +17,7 @@ def test_distance_between_page_files():
|
||||||
# → 2 differences
|
# → 2 differences
|
||||||
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
|
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
|
||||||
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
|
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
|
||||||
assert distance(gt, ocr) == 2
|
assert distance(gt, ocr) == 2 / 827
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.integration
|
@pytest.mark.integration
|
||||||
|
@ -52,4 +52,4 @@ def test_distance_between_page_alto_2():
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
assert distance(gt, ocr) == 8 # Manually verified
|
assert distance(gt, ocr) == 8 / 594 # Manually verified
|
||||||
|
|
|
@ -12,9 +12,9 @@ from .util import working_directory
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"gt_file_content,ocr_file_content,cer_expected",
|
"gt_file_content,ocr_file_content,cer_expected",
|
||||||
[
|
[
|
||||||
("", "Lorem ipsum", math.inf),
|
("", "Lorem ipsum", 1.0),
|
||||||
("Lorem ipsum", "", 1.0),
|
("Lorem ipsum", "", 1.0),
|
||||||
("\ufeff", "Lorem ipsum", math.inf),
|
("\ufeff", "Lorem ipsum", 1.0),
|
||||||
("Lorem ipsum", "\ufeff", 1.0),
|
("Lorem ipsum", "\ufeff", 1.0),
|
||||||
("", "", 0.0),
|
("", "", 0.0),
|
||||||
("\ufeff", "", 0.0),
|
("\ufeff", "", 0.0),
|
||||||
|
|
|
@ -64,5 +64,5 @@ def test_word_error_rate_between_page_alto_2():
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
word_error_rate(gt, ocr) == 7 / gt_word_count
|
word_error_rate(gt, ocr) == 7 / (gt_word_count + 1)
|
||||||
) # Manually verified, 6 words are wrong, 1 got split (=2 errors)
|
) # Manually verified, 6 words are wrong, 1 got split (=2 errors)
|
||||||
|
|
|
@ -76,7 +76,7 @@ def test_word_error_rate():
|
||||||
)
|
)
|
||||||
|
|
||||||
assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
|
assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
|
||||||
assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!"))
|
assert word_error_rate("", "Dies ist ein Beispielsatz!") == 4 / 4
|
||||||
assert word_error_rate("", "") == 0
|
assert word_error_rate("", "") == 0
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue