🚧 dinglehopper: Add word differences in line-dirs report

2026-02-24 18:32:12 +01:00 · 2021-12-14 18:20:04 +01:00 · 2021-12-14 18:20:04 +01:00 · cb2be96179
commit cb2be96179
parent dbb660615a
1 changed file with 7 additions and 8 deletions
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -43,6 +43,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
    cer = None
    n_characters = None
    char_diff_report = ""
+    word_diff_report = ""
    for k, gt in enumerate(os.listdir(gt_dir)):
        # Find a match by replacing the suffix
@@ -65,16 +66,14 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
        wer = 9999; n_words = 0
        char_diff_report += gen_diff_report(
-             gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
+            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
        )
-        # TODO
+        gt_words = words_normalized(gt_text)
-        #  gt_words = words_normalized(gt_text)
+        ocr_words = words_normalized(ocr_text)
-        #  ocr_words = words_normalized(ocr_text)
+        word_diff_report += gen_diff_report(
-        #  word_diff_report = gen_diff_report(
+            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯"
-        #      gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
+        )
-        #  )
-        word_diff_report = "TODO"
    # XXX this is a copy from cli.py