mirror of
				https://github.com/qurator-spk/dinglehopper.git
				synced 2025-11-04 11:24:17 +01:00 
			
		
		
		
	🐛 Fix calculation of score_hint for edge cases, e.g. when CER is infinite
If the CER is infinite, we can't calculate a score_hint as an int. Fall back to None in this case.
This commit is contained in:
		
							parent
							
								
									bc95c03127
								
							
						
					
					
						commit
						e256526ea1
					
				
					 4 changed files with 34 additions and 6 deletions
				
			
		| 
						 | 
				
			
			@ -1,3 +1,6 @@
 | 
			
		|||
import math
 | 
			
		||||
from math import ceil
 | 
			
		||||
 | 
			
		||||
from .edit_distance import *
 | 
			
		||||
from rapidfuzz.distance import Levenshtein
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -8,6 +11,22 @@ def align(t1, t2):
 | 
			
		|||
    return seq_align(s1, s2)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def score_hint(er: float, n: int) -> "int | None":
    """Calculate the RapidFuzz score hint for a given error rate and count.

    The score hint is the expected distance, i.e. ``ceil(er * n)``.

    Args:
        er: Error rate. May be ``inf``, but must not be NaN.
        n: Count the error rate refers to.

    Returns:
        The expected distance as an int, or None if it cannot be expressed as
        an int (e.g. the error rate is inf).

    Raises:
        ValueError: If ``er`` is NaN.
    """
    # NOTE: The return annotation is quoted on purpose. An unquoted
    # ``int | None`` is evaluated when the function is defined and raises a
    # TypeError on Python < 3.10, which would break importing this module.

    # Explicit check instead of ``assert``: asserts are stripped under
    # ``python -O``, which would turn a NaN error rate into a silent None.
    if math.isnan(er):
        raise ValueError("er must not be NaN")

    try:
        # ceil() already returns an int. It raises OverflowError for inf and
        # ValueError for NaN (inf * 0 is NaN, so this happens for n == 0).
        return ceil(er * n)
    except (OverflowError, ValueError):
        return None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def seq_align(s1, s2, score_hint=None):
 | 
			
		||||
    """Align general sequences."""
 | 
			
		||||
    s1 = list(s1)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -8,7 +8,7 @@ from math import ceil
 | 
			
		|||
 | 
			
		||||
from .character_error_rate import character_error_rate_n
 | 
			
		||||
from .word_error_rate import word_error_rate_n, words_normalized
 | 
			
		||||
from .align import seq_align
 | 
			
		||||
from .align import seq_align, score_hint
 | 
			
		||||
from .extracted_text import ExtractedText
 | 
			
		||||
from .ocr_files import extract
 | 
			
		||||
from .config import Config
 | 
			
		||||
| 
						 | 
				
			
			@ -110,12 +110,14 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
 | 
			
		|||
 | 
			
		||||
    cer, n_characters = character_error_rate_n(gt_text, ocr_text)
 | 
			
		||||
    char_diff_report = gen_diff_report(
 | 
			
		||||
        gt_text, ocr_text, css_prefix="c", joiner="", none="·", score_hint=int(ceil(cer * n_characters))
 | 
			
		||||
        gt_text, ocr_text, css_prefix="c", joiner="", none="·",
 | 
			
		||||
        score_hint=score_hint(cer, n_characters)
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    wer, n_words = word_error_rate_n(gt_words, ocr_words)
 | 
			
		||||
    word_diff_report = gen_diff_report(
 | 
			
		||||
        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯", score_hint=int(ceil(wer * n_words))
 | 
			
		||||
        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯",
 | 
			
		||||
        score_hint=score_hint(wer, n_words)
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    env = Environment(
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -75,10 +75,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
 | 
			
		|||
 | 
			
		||||
        # Generate diff reports
 | 
			
		||||
        char_diff_report += gen_diff_report(
 | 
			
		||||
            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·", score_hint=int(ceil(l_cer * l_n_characters))
 | 
			
		||||
            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·",
 | 
			
		||||
            score_hint=score_hint(l_cer, l_n_characters)
 | 
			
		||||
        )
 | 
			
		||||
        word_diff_report += gen_diff_report(
 | 
			
		||||
            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯", score_hint=int(ceil(l_wer * l_n_words))
 | 
			
		||||
            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯",
 | 
			
		||||
            score_hint=score_hint(l_wer, l_n_words))
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    env = Environment(
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,6 +1,7 @@
 | 
			
		|||
import math
 | 
			
		||||
import pytest
 | 
			
		||||
from .util import unzip
 | 
			
		||||
from .. import align, seq_align, distance
 | 
			
		||||
from .. import align, seq_align, distance, score_hint
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_left_empty():
 | 
			
		||||
| 
						 | 
				
			
			@ -181,3 +182,7 @@ def test_lines_similar():
 | 
			
		|||
 | 
			
		||||
    # Test __eq__ (i.e. is it a substitution or a similar string?)
 | 
			
		||||
    assert list(left)[0] == list(right)[0]
 | 
			
		||||
 | 
			
		||||
def test_score_hint():
    """Check score_hint() for a regular value and for its edge cases."""
    assert score_hint(0.5, 23) == 12  # ceil(11.5)
    assert score_hint(math.inf, 12345) is None
    # inf * 0 is NaN, which must fall back to None as well
    assert score_hint(math.inf, 0) is None
    # No errors on an empty input: the expected distance is 0
    assert score_hint(0.0, 0) == 0
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue