diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py index 94dcee4..f3d1f84 100644 --- a/qurator/dinglehopper/cli_line_dirs.py +++ b/qurator/dinglehopper/cli_line_dirs.py @@ -44,12 +44,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): n_characters = None char_diff_report = "" - for gt in os.listdir(gt_dir): + for k, gt in enumerate(os.listdir(gt_dir)): # Find a match by replacing the suffix ocr = removesuffix(gt, gt_suffix) + ocr_suffix - gt_text = plain_extract(os.path.join(gt_dir, gt)) - ocr_text = plain_extract(os.path.join(ocr_dir, ocr)) + gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True) + ocr_text = plain_extract(os.path.join(ocr_dir, ocr), include_filename_in_id=True) # Compute CER l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text) @@ -65,7 +65,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): wer = 9999; n_words = 0 char_diff_report += gen_diff_report( - gt_text, ocr_text, css_prefix="c", joiner="", none="·" + gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·" ) # TODO diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 5271727..69f4df7 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -1,8 +1,9 @@ from __future__ import division, print_function +import os +import sys from typing import Iterator from warnings import warn -import sys from lxml import etree as ET from lxml.etree import XMLSyntaxError @@ -130,12 +131,15 @@ def page_text(tree, *, textequiv_level="region"): return page_extract(tree, textequiv_level=textequiv_level).text -def plain_extract(filename): +def plain_extract(filename, include_filename_in_id=False): + id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}" with open(filename, "r") as f: return ExtractedText( None, [ - ExtractedText("line %d" % no, None, None, normalize_sbb(line)) + ExtractedText( + id_template.format(filename=os.path.basename(filename), no=no), + None, None, normalize_sbb(line)) for no, line in enumerate(f.readlines()) ], "\n",