🚧 dinglehopper: Compare line text directories (WIP)

2025-11-03 02:44:14 +01:00 · 2021-12-13 20:02:18 +01:00 · 2021-12-13 20:02:18 +01:00 · dbb660615a
commit dbb660615a
parent a018006f98
2 changed files with 11 additions and 7 deletions
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -44,12 +44,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
    n_characters = None
    char_diff_report = ""
-    for gt in os.listdir(gt_dir):
+    for k, gt in enumerate(os.listdir(gt_dir)):
        # Find a match by replacing the suffix
        ocr = removesuffix(gt, gt_suffix) + ocr_suffix
-        gt_text = plain_extract(os.path.join(gt_dir, gt))
+        gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
-        ocr_text = plain_extract(os.path.join(ocr_dir, ocr))
+        ocr_text = plain_extract(os.path.join(ocr_dir, ocr), include_filename_in_id=True)
        # Compute CER
        l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
@@ -65,7 +65,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
        wer = 9999; n_words = 0
        char_diff_report += gen_diff_report(
-             gt_text, ocr_text, css_prefix="c", joiner="", none="·"
+             gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
        )
        # TODO
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -1,8 +1,9 @@
 from __future__ import division, print_function
 import os
 import sys
 from typing import Iterator
 from warnings import warn
 import sys
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
@@ -130,12 +131,15 @@ def page_text(tree, *, textequiv_level="region"):
    return page_extract(tree, textequiv_level=textequiv_level).text
-def plain_extract(filename):
+def plain_extract(filename, include_filename_in_id=False):
    id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
    with open(filename, "r") as f:
        return ExtractedText(
            None,
            [
-                ExtractedText("line %d" % no, None, None, normalize_sbb(line))
+                ExtractedText(
                    id_template.format(filename=os.path.basename(filename), no=no),
                    None, None, normalize_sbb(line))
                for no, line in enumerate(f.readlines())
            ],
            "\n",