From 0583d8c0f04ae6ffef65f833d1b1e573ef61208d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 20 Mar 2025 19:35:12 +0100 Subject: [PATCH 1/5] use Levenshtein.normalized_distance instead of distance --- src/dinglehopper/character_error_rate.py | 9 +-------- src/dinglehopper/edit_distance.py | 12 ++++++------ src/dinglehopper/word_error_rate.py | 9 ++------- 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/src/dinglehopper/character_error_rate.py b/src/dinglehopper/character_error_rate.py index 88a88f8..04e4bfe 100644 --- a/src/dinglehopper/character_error_rate.py +++ b/src/dinglehopper/character_error_rate.py @@ -20,14 +20,7 @@ def character_error_rate_n( :return: character error rate and length of the reference """ - d = distance(reference, compared) - n = len(reference) - - if d == 0: - return 0, n - if n == 0: - return float("inf"), n - return d / n, n + return distance(reference, compared), len(reference) # XXX Should we really count newlines here? diff --git a/src/dinglehopper/edit_distance.py b/src/dinglehopper/edit_distance.py index ec564ae..988849c 100644 --- a/src/dinglehopper/edit_distance.py +++ b/src/dinglehopper/edit_distance.py @@ -9,18 +9,18 @@ from .extracted_text import ExtractedText @multimethod -def distance(seq1: List[str], seq2: List[str]) -> int: +def distance(seq1: List[str], seq2: List[str]) -> float: """Compute the Levenshtein edit distance between two lists of grapheme clusters. This assumes that the grapheme clusters are already normalized. Use distance(str, str) instead if you need to compare two Unicode strings. """ - return Levenshtein.distance(seq1, seq2) + return Levenshtein.normalized_distance(seq1, seq2) @distance.register -def _(s1: str, s2: str) -> int: +def _(s1: str, s2: str) -> float: """Compute the Levenshtein edit distance between two Unicode strings Note that this is different from levenshtein() as this function knows about Unicode @@ -29,12 +29,12 @@ def _(s1: str, s2: str) -> int: """ seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1))) seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2))) - return Levenshtein.distance(seq1, seq2) + return Levenshtein.normalized_distance(seq1, seq2) @distance.register -def _(s1: ExtractedText, s2: ExtractedText) -> int: - return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters) +def _(s1: ExtractedText, s2: ExtractedText) -> float: + return Levenshtein.normalized_distance(s1.grapheme_clusters, s2.grapheme_clusters) def editops(word1, word2): diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py index ec039b3..abaa168 100644 --- a/src/dinglehopper/word_error_rate.py +++ b/src/dinglehopper/word_error_rate.py @@ -96,15 +96,10 @@ def _(reference: Iterable[T], compared: Iterable[T]) -> Tuple[float, int]: reference_seq = list(reference) compared_seq = list(compared) - d = Levenshtein.distance(reference_seq, compared_seq) + d = Levenshtein.normalized_distance(reference_seq, compared_seq) n = len(reference_seq) - if d == 0: - return 0, n - if n == 0: - return float("inf"), n - return d / n, n - + return d, n def word_error_rate(reference: T, compared: T) -> float: wer: float From 7a79bae6fd98d7497443c17e691df29493e225da Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 20 Mar 2025 19:35:19 +0100 Subject: [PATCH 2/5] adapt tests --- src/dinglehopper/tests/test_character_error_rate.py | 4 ++-- src/dinglehopper/tests/test_edit_distance.py | 6 +++--- .../tests/test_integ_character_error_rate_ocr.py | 2 +- src/dinglehopper/tests/test_integ_cli_valid_json.py | 4 ++-- src/dinglehopper/tests/test_integ_edit_distance_ocr.py | 4 ++-- src/dinglehopper/tests/test_integ_empty_files.py | 4 ++-- src/dinglehopper/tests/test_integ_word_error_rate_ocr.py | 2 +- src/dinglehopper/tests/test_word_error_rate.py | 2 +- 8 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/dinglehopper/tests/test_character_error_rate.py b/src/dinglehopper/tests/test_character_error_rate.py index 970f740..63d2f72 100644 --- a/src/dinglehopper/tests/test_character_error_rate.py +++ b/src/dinglehopper/tests/test_character_error_rate.py @@ -14,9 +14,9 @@ def test_character_error_rate(): assert character_error_rate("Foo", "") == 3 / 3 assert character_error_rate("", "") == 0 - assert math.isinf(character_error_rate("", "Foo")) + assert character_error_rate("", "Foo") == 3 / 3 - assert character_error_rate("Foo", "Food") == 1 / 3 + assert character_error_rate("Foo", "Food") == 1 / 4 assert character_error_rate("Fnord", "Food") == 2 / 5 assert character_error_rate("Müll", "Mull") == 1 / 4 assert character_error_rate("Abstand", "Sand") == 4 / 7 diff --git a/src/dinglehopper/tests/test_edit_distance.py b/src/dinglehopper/tests/test_edit_distance.py index be427a8..e782ca6 100644 --- a/src/dinglehopper/tests/test_edit_distance.py +++ b/src/dinglehopper/tests/test_edit_distance.py @@ -6,8 +6,8 @@ from .. import distance def test_distance(): - assert distance("Fnord", "Food") == 2 - assert distance("Müll", "Mull") == 1 + assert distance("Fnord", "Food") == 2 / 5 + assert distance("Müll", "Mull") == 1 / 4 word1 = unicodedata.normalize("NFC", "Schlyñ") word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed! @@ -21,4 +21,4 @@ def test_distance(): assert ( len(word2) == 7 ) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points - assert distance(word1, word2) == 1 + assert distance(word1, word2) == 1 / 6 diff --git a/src/dinglehopper/tests/test_integ_character_error_rate_ocr.py b/src/dinglehopper/tests/test_integ_character_error_rate_ocr.py index 7755e2d..b3a5914 100644 --- a/src/dinglehopper/tests/test_integ_character_error_rate_ocr.py +++ b/src/dinglehopper/tests/test_integ_character_error_rate_ocr.py @@ -56,4 +56,4 @@ def test_character_error_rate_between_page_alto_2(): ) ) - assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified + assert character_error_rate(gt, ocr) == 8 / 594 # Manually verified diff --git a/src/dinglehopper/tests/test_integ_cli_valid_json.py b/src/dinglehopper/tests/test_integ_cli_valid_json.py index 6cbfa0c..a993ee7 100644 --- a/src/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/src/dinglehopper/tests/test_integ_cli_valid_json.py @@ -32,11 +32,11 @@ def test_cli_json_cer_is_infinity(tmp_path): with working_directory(tmp_path): with open("gt.txt", "w") as gtf: - gtf.write("") # Empty to yield CER == inf + gtf.write("") with open("ocr.txt", "w") as ocrf: ocrf.write("Not important") process("gt.txt", "ocr.txt", "report") with open("report.json", "r") as jsonf: j = json.load(jsonf) - assert j["cer"] == pytest.approx(float("inf")) + assert j["cer"] == pytest.approx(1.0) diff --git a/src/dinglehopper/tests/test_integ_edit_distance_ocr.py b/src/dinglehopper/tests/test_integ_edit_distance_ocr.py index e01ac76..b032e22 100644 --- a/src/dinglehopper/tests/test_integ_edit_distance_ocr.py +++ b/src/dinglehopper/tests/test_integ_edit_distance_ocr.py @@ -17,7 +17,7 @@ def test_distance_between_page_files(): # → 2 differences gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml"))) - assert distance(gt, ocr) == 2 + assert distance(gt, ocr) == 2 / 827 @pytest.mark.integration @@ -52,4 +52,4 @@ def test_distance_between_page_alto_2(): ) ) - assert distance(gt, ocr) == 8 # Manually verified + assert distance(gt, ocr) == 8 / 594 # Manually verified diff --git a/src/dinglehopper/tests/test_integ_empty_files.py b/src/dinglehopper/tests/test_integ_empty_files.py index 5c90ed1..2b29513 100644 --- a/src/dinglehopper/tests/test_integ_empty_files.py +++ b/src/dinglehopper/tests/test_integ_empty_files.py @@ -12,9 +12,9 @@ from .util import working_directory @pytest.mark.parametrize( "gt_file_content,ocr_file_content,cer_expected", [ - ("", "Lorem ipsum", math.inf), + ("", "Lorem ipsum", 1.0), ("Lorem ipsum", "", 1.0), - ("\ufeff", "Lorem ipsum", math.inf), + ("\ufeff", "Lorem ipsum", 1.0), ("Lorem ipsum", "\ufeff", 1.0), ("", "", 0.0), ("\ufeff", "", 0.0), diff --git a/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py b/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py index 8a57ed2..f114685 100644 --- a/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py +++ b/src/dinglehopper/tests/test_integ_word_error_rate_ocr.py @@ -64,5 +64,5 @@ def test_word_error_rate_between_page_alto_2(): ) assert ( - word_error_rate(gt, ocr) == 7 / gt_word_count + word_error_rate(gt, ocr) == 7 / (gt_word_count + 1) ) # Manually verified, 6 words are wrong, 1 got split (=2 errors) diff --git a/src/dinglehopper/tests/test_word_error_rate.py b/src/dinglehopper/tests/test_word_error_rate.py index 311ffff..245fa74 100644 --- a/src/dinglehopper/tests/test_word_error_rate.py +++ b/src/dinglehopper/tests/test_word_error_rate.py @@ -76,7 +76,7 @@ def test_word_error_rate(): ) assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4 - assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!")) + assert word_error_rate("", "Dies ist ein Beispielsatz!") == 4 / 4 assert word_error_rate("", "") == 0 assert ( From 9e0701d1fe296e89513d3aaa70899586ad77da1d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Apr 2025 16:08:41 +0200 Subject: [PATCH 3/5] opt out of 7f8a8dd5 (uniseg update that requires py39) --- requirements.txt | 2 +- src/dinglehopper/word_error_rate.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 123187b..653ec59 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ click jinja2 lxml -uniseg >= 0.9.1 +uniseg >= 0.8.0 numpy colorama MarkupSafe diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py index abaa168..514917e 100644 --- a/src/dinglehopper/word_error_rate.py +++ b/src/dinglehopper/word_error_rate.py @@ -21,10 +21,15 @@ def patch_word_break(): https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt """ old_word_break = uniseg.wordbreak.word_break + if hasattr(uniseg.wordbreak, 'Word_Break'): + aletter = uniseg.wordbreak.Word_Break.ALetter + else: + # uniseg<0.9 + aletter = uniseg.wordbreak.WordBreak.ALETTER def new_word_break(c): if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area - return uniseg.wordbreak.Word_Break.ALetter + return aletter else: return old_word_break(c) From 7b618921ce7b54b3e15b986e3c33378a76ec7f89 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Apr 2025 16:09:42 +0200 Subject: [PATCH 4/5] CI: reactivate py38 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 277d4ba..387f7a2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,7 +25,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ] runs-on: "ubuntu-latest" From b7d1cb455a8b2636dd3de880565ac1b045bed1c7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Apr 2025 16:47:13 +0200 Subject: [PATCH 5/5] re-allow py38 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9dabb41..62fae82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ authors = [ description = "An OCR evaluation tool" readme = "README.md" license.file = "LICENSE" -requires-python = ">=3.9" +requires-python = ">=3.8" keywords = ["qurator", "ocr", "evaluation", "ocr-d"] dynamic = ["version", "dependencies", "optional-dependencies"]