Mirror of https://github.com/qurator-spk/dinglehopper.git
commit 30d9917115
Robert Sachunsky, 2025-04-17 14:44:32 +00:00, committed by GitHub
14 changed files with 32 additions and 39 deletions

View file

@@ -25,7 +25,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ]
     runs-on: "ubuntu-latest"

View file

@@ -10,7 +10,7 @@ authors = [
 description = "An OCR evaluation tool"
 readme = "README.md"
 license.file = "LICENSE"
-requires-python = ">=3.9"
+requires-python = ">=3.8"
 keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
 dynamic = ["version", "dependencies", "optional-dependencies"]

View file

@@ -1,7 +1,7 @@
 click
 jinja2
 lxml
-uniseg >= 0.9.1
+uniseg >= 0.8.0
 numpy
 colorama
 MarkupSafe

View file

@@ -20,14 +20,7 @@ def character_error_rate_n(
     :return: character error rate and length of the reference
     """
-    d = distance(reference, compared)
-    n = len(reference)
-    if d == 0:
-        return 0, n
-    if n == 0:
-        return float("inf"), n
-    return d / n, n
+    return distance(reference, compared), len(reference)
     # XXX Should we really count newlines here?
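Note: this simplification changes the semantics of the returned error rate. distance() now yields a value already normalized to [0, 1] (see the next file), so the CER is no longer the raw edit distance divided by the reference length (infinite for an empty reference), but divided by the longer of the two sides, which gives 1.0 when only one side is empty. A sketch of the shift, with values taken from the updated tests further down:

    # Old behaviour (sketch), with d the raw edit distance:
    #   cer = d / len(reference)                      -> inf if reference == ""
    # New behaviour via normalized_distance:
    #   cer = d / max(len(reference), len(compared))  -> 1.0 if only one side is empty
    assert character_error_rate("Foo", "Food") == 1 / 4  # was 1 / 3
    assert character_error_rate("", "Foo") == 1.0        # was inf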

View file

@@ -9,18 +9,18 @@ from .extracted_text import ExtractedText
 @multimethod
-def distance(seq1: List[str], seq2: List[str]) -> int:
+def distance(seq1: List[str], seq2: List[str]) -> float:
     """Compute the Levenshtein edit distance between two lists of grapheme clusters.

     This assumes that the grapheme clusters are already normalized.
     Use distance(str, str) instead if you need to compare two Unicode strings.
     """
-    return Levenshtein.distance(seq1, seq2)
+    return Levenshtein.normalized_distance(seq1, seq2)


 @distance.register
-def _(s1: str, s2: str) -> int:
+def _(s1: str, s2: str) -> float:
     """Compute the Levenshtein edit distance between two Unicode strings

     Note that this is different from levenshtein() as this function knows about Unicode
@@ -29,12 +29,12 @@ def _(s1: str, s2: str) -> int:
     """
     seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
     seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    return Levenshtein.distance(seq1, seq2)
+    return Levenshtein.normalized_distance(seq1, seq2)


 @distance.register
-def _(s1: ExtractedText, s2: ExtractedText) -> int:
-    return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters)
+def _(s1: ExtractedText, s2: ExtractedText) -> float:
+    return Levenshtein.normalized_distance(s1.grapheme_clusters, s2.grapheme_clusters)


 def editops(word1, word2):
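For reference, assuming the Levenshtein module used here is rapidfuzz.distance.Levenshtein: with the default unit weights, normalized_distance divides the raw edit distance by the length of the longer sequence and is defined (as 0.0) for two empty inputs. A quick check:

    from rapidfuzz.distance import Levenshtein

    assert Levenshtein.distance("Fnord", "Food") == 2
    assert Levenshtein.normalized_distance("Fnord", "Food") == 2 / 5  # 2 / max(5, 4)
    assert Levenshtein.normalized_distance("", "Foo") == 1.0          # no special-casing needed
    assert Levenshtein.normalized_distance("", "") == 0.0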

View file

@@ -14,9 +14,9 @@ def test_character_error_rate():
     assert character_error_rate("Foo", "") == 3 / 3

     assert character_error_rate("", "") == 0
-    assert math.isinf(character_error_rate("", "Foo"))
+    assert character_error_rate("", "Foo") == 3 / 3

-    assert character_error_rate("Foo", "Food") == 1 / 3
+    assert character_error_rate("Foo", "Food") == 1 / 4
     assert character_error_rate("Fnord", "Food") == 2 / 5
     assert character_error_rate("Müll", "Mull") == 1 / 4
     assert character_error_rate("Abstand", "Sand") == 4 / 7

View file

@@ -6,8 +6,8 @@ from .. import distance
 def test_distance():
-    assert distance("Fnord", "Food") == 2
-    assert distance("Müll", "Mull") == 1
+    assert distance("Fnord", "Food") == 2 / 5
+    assert distance("Müll", "Mull") == 1 / 4

     word1 = unicodedata.normalize("NFC", "Schlyñ")
     word2 = unicodedata.normalize("NFD", "Schlyñ")  # Different, decomposed!
@@ -21,4 +21,4 @@ def test_distance():
     assert (
         len(word2) == 7
     )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
-    assert distance(word1, word2) == 1
+    assert distance(word1, word2) == 1 / 6
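The 1/6 follows from comparing grapheme clusters rather than code points: in the second hunk (the reassignment at lines 17-20 is elided), word2 ends in m + COMBINING TILDE, so it is 7 code points but, like word1, 6 grapheme clusters, and only the final cluster differs. A standalone check with uniseg, using escape sequences for ñ and the combining tilde:

    import unicodedata
    from uniseg.graphemecluster import grapheme_clusters

    word1 = "Schly\u00f1"   # ends in ñ: 6 code points, 6 grapheme clusters
    word2 = "Schlym\u0303"  # ends in m + combining tilde: 7 code points, 6 clusters

    assert len(list(grapheme_clusters(unicodedata.normalize("NFC", word1)))) == 6
    assert len(list(grapheme_clusters(unicodedata.normalize("NFC", word2)))) == 6
    # Raw distance between the cluster lists is 1; normalized: 1 / max(6, 6) == 1 / 6.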

View file

@@ -56,4 +56,4 @@ def test_character_error_rate_between_page_alto_2():
         )
     )

-    assert character_error_rate(gt, ocr) == 8 / 591  # Manually verified
+    assert character_error_rate(gt, ocr) == 8 / 594  # Manually verified

View file

@@ -32,11 +32,11 @@ def test_cli_json_cer_is_infinity(tmp_path):
     with working_directory(tmp_path):
         with open("gt.txt", "w") as gtf:
-            gtf.write("")  # Empty to yield CER == inf
+            gtf.write("")

         with open("ocr.txt", "w") as ocrf:
             ocrf.write("Not important")

         process("gt.txt", "ocr.txt", "report")
         with open("report.json", "r") as jsonf:
             j = json.load(jsonf)
-            assert j["cer"] == pytest.approx(float("inf"))
+            assert j["cer"] == pytest.approx(1.0)

View file

@@ -17,7 +17,7 @@ def test_distance_between_page_files():
     # → 2 differences
     gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
     ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
-    assert distance(gt, ocr) == 2
+    assert distance(gt, ocr) == 2 / 827


 @pytest.mark.integration
@@ -52,4 +52,4 @@ def test_distance_between_page_alto_2():
         )
     )

-    assert distance(gt, ocr) == 8  # Manually verified
+    assert distance(gt, ocr) == 8 / 594  # Manually verified

View file

@@ -12,9 +12,9 @@ from .util import working_directory
 @pytest.mark.parametrize(
     "gt_file_content,ocr_file_content,cer_expected",
     [
-        ("", "Lorem ipsum", math.inf),
+        ("", "Lorem ipsum", 1.0),
         ("Lorem ipsum", "", 1.0),
-        ("\ufeff", "Lorem ipsum", math.inf),
+        ("\ufeff", "Lorem ipsum", 1.0),
         ("Lorem ipsum", "\ufeff", 1.0),
         ("", "", 0.0),
         ("\ufeff", "", 0.0),

View file

@@ -64,5 +64,5 @@ def test_word_error_rate_between_page_alto_2():
     )

     assert (
-        word_error_rate(gt, ocr) == 7 / gt_word_count
+        word_error_rate(gt, ocr) == 7 / (gt_word_count + 1)
     )  # Manually verified, 6 words are wrong, 1 got split (=2 errors)
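The denominator grows by one because the normalization base is now the longer word sequence: the split turns one reference word into two OCR words, so the OCR side has gt_word_count + 1 tokens. A sketch with hypothetical word lists (rapidfuzz accepts lists of strings):

    from rapidfuzz.distance import Levenshtein

    # Hypothetical example: one word got split in two, nothing else is wrong.
    gt_words = ["dies", "ist", "ein", "Beispielsatz"]
    ocr_words = ["dies", "ist", "ein", "Beispiel", "satz"]

    assert Levenshtein.distance(gt_words, ocr_words) == 2  # substitution + insertion
    assert Levenshtein.normalized_distance(gt_words, ocr_words) == 2 / 5  # 2 / (4 + 1)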

View file

@@ -76,7 +76,7 @@ def test_word_error_rate():
     )
     assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
-    assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!"))
+    assert word_error_rate("", "Dies ist ein Beispielsatz!") == 4 / 4
     assert word_error_rate("", "") == 0
     assert (
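Why 4/4: the word segmenter counts only tokens containing letters or digits, so the trailing "!" is not a word and the sentence has 4 words on either side of the comparison. A hypothetical re-creation of that filtering (word_tokens is not dinglehopper API, just an illustration of the rule):

    import unicodedata
    import uniseg.wordbreak

    def word_tokens(s):
        # Keep only segments that contain at least one letter or digit.
        return [
            w for w in uniseg.wordbreak.words(s)
            if any(unicodedata.category(c)[0] in "LN" for c in w)
        ]

    assert word_tokens("Dies ist ein Beispielsatz!") == ["Dies", "ist", "ein", "Beispielsatz"]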

View file

@@ -21,10 +21,15 @@ def patch_word_break():
     https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
     """
     old_word_break = uniseg.wordbreak.word_break
+    if hasattr(uniseg.wordbreak, 'Word_Break'):
+        aletter = uniseg.wordbreak.Word_Break.ALetter
+    else:
+        # uniseg<0.9
+        aletter = uniseg.wordbreak.WordBreak.ALETTER

     def new_word_break(c):
         if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
-            return uniseg.wordbreak.Word_Break.ALetter
+            return aletter
         else:
             return old_word_break(c)
@@ -96,15 +101,10 @@ def _(reference: Iterable[T], compared: Iterable[T]) -> Tuple[float, int]:
     reference_seq = list(reference)
     compared_seq = list(compared)

-    d = Levenshtein.distance(reference_seq, compared_seq)
+    d = Levenshtein.normalized_distance(reference_seq, compared_seq)
     n = len(reference_seq)
-    if d == 0:
-        return 0, n
-    if n == 0:
-        return float("inf"), n
-    return d / n, n
+    return d, n


 def word_error_rate(reference: T, compared: T) -> float:
     wer: float
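The shim in the first hunk pairs with the relaxed uniseg pin in requirements.txt: per the comment in the diff, the word-break property enum is Word_Break.ALetter on newer uniseg and WordBreak.ALETTER before 0.9. Detecting the attribute with hasattr() keeps the code working on both sides of the rename without parsing version strings. The pattern in isolation:

    import uniseg.wordbreak

    # Feature detection instead of a version check: probe for the attribute
    # the newer API provides and fall back to the older spelling.
    if hasattr(uniseg.wordbreak, 'Word_Break'):
        aletter = uniseg.wordbreak.Word_Break.ALetter  # uniseg >= 0.9
    else:
        aletter = uniseg.wordbreak.WordBreak.ALETTER   # uniseg < 0.9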