✨ dinglehopper: Include number of characters and words in JSON report

2026-07-30 23:42:36 +02:00 · 2020-02-21 14:53:12 +01:00 · 2020-02-21 14:53:12 +01:00 · 779472575c
commit 779472575c
parent be251a391e
4 changed files with 41 additions and 18 deletions
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@ -1,21 +1,36 @@
 from __future__ import division
 import unicodedata
 from typing import Tuple
 from uniseg.graphemecluster import grapheme_clusters
 from qurator.dinglehopper.edit_distance import distance
-def character_error_rate(reference, compared):
+def character_error_rate_n(reference, compared) -> Tuple[float, int]:
    """
    Compute character error rate.
    :return: character error rate and length of the reference
    """
    d = distance(reference, compared)
    if d == 0:
        return 0
    n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
    if n == 0:
        return float('inf')
-    return d/n
+    if d == 0:
        return 0, n
    if n == 0:
        return float('inf'), n
    return d/n, n
    # XXX Should we really count newlines here?
 def character_error_rate(reference, compared) -> float:
    """
    Compute character error rate.
    :return: character error rate
    """
    cer, _ = character_error_rate_n(reference, compared)
    return cer
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@ -57,8 +57,8 @@ def process(gt, ocr, report_prefix):
    gt_text = substitute_equivalences(gt_text)
    ocr_text = substitute_equivalences(ocr_text)
-    cer = character_error_rate(gt_text, ocr_text)
+    cer, n_characters = character_error_rate_n(gt_text, ocr_text)
-    wer = word_error_rate(gt_text, ocr_text)
+    wer, n_words = word_error_rate_n(gt_text, ocr_text)
    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
@ -88,7 +88,8 @@ def process(gt, ocr, report_prefix):
        template = env.get_template(template_fn)
        template.stream(
            gt=gt, ocr=ocr,
-            cer=cer, wer=wer,
+            cer=cer, n_characters=n_characters,
            wer=wer, n_words=n_words,
            char_diff_report=char_diff_report,
            word_diff_report=word_diff_report
        ).dump(out_fn)
--- a/qurator/dinglehopper/templates/report.json.j2
+++ b/qurator/dinglehopper/templates/report.json.j2
@ -2,5 +2,7 @@
    "gt": "{{ gt }}",
    "ocr": "{{ ocr }}",
    "cer": {{ cer|json_float }},
-    "wer": {{ wer|json_float }}
+    "wer": {{ wer|json_float }},
    "n_characters": {{ n_characters }},
    "n_words": {{ n_words }}
 }
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@ -1,6 +1,7 @@
 from __future__ import division
 import unicodedata
 from typing import Tuple
 import uniseg.wordbreak
@ -44,7 +45,7 @@ def words_normalized(s):
    return words(unicodedata.normalize('NFC', s))
-def word_error_rate(reference, compared):
+def word_error_rate_n(reference, compared) -> Tuple[float, int]:
    if isinstance(reference, str):
        reference_seq = list(words_normalized(reference))
        compared_seq = list(words_normalized(compared))
@ -53,11 +54,15 @@ def word_error_rate(reference, compared):
        compared_seq = list(compared)
    d = levenshtein(reference_seq, compared_seq)
    if d == 0:
        return 0
    n = len(reference_seq)
    if n == 0:
        return float('inf')
-    return d / n
+    if d == 0:
        return 0, n
    if n == 0:
        return float('inf'), n
    return d / n, n
 def word_error_rate(reference, compared) -> float:
    wer, _ = word_error_rate_n(reference, compared)
    return wer