From 5b394649a7777f95932ab74c1e26743e8e180849 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 14 Dec 2021 18:33:20 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20dinglehopper:=20Compute=20WER=20?= =?UTF-8?q?in=20line-dirs=20CLI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/cli_line_dirs.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py index 5c877f2..48b86d2 100644 --- a/qurator/dinglehopper/cli_line_dirs.py +++ b/qurator/dinglehopper/cli_line_dirs.py @@ -43,6 +43,8 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): cer = None n_characters = None char_diff_report = "" + wer = None + n_words = None word_diff_report = "" for k, gt in enumerate(os.listdir(gt_dir)): @@ -62,13 +64,18 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): n_characters = n_characters + l_n_characters # Compute WER - # TODO wer, n_words = word_error_rate_n(gt_text, ocr_text) - wer = 9999; n_words = 0 + l_wer, l_n_words = word_error_rate_n(gt_text, ocr_text) + if wer is None: + wer, n_words = l_wer, l_n_words + else: + # Rolling update + wer = (wer * n_words + l_wer * l_n_words) / (n_words + l_n_words) + n_words = n_words + l_n_words + # Generate diff reports char_diff_report += gen_diff_report( gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·" ) - gt_words = words_normalized(gt_text) ocr_words = words_normalized(ocr_text) word_diff_report += gen_diff_report(