From b93db45ca826ff5689fc3b979c1bdfd553203df0 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 13:59:55 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20Support=20'merged'=20GT+OCR=20li?= =?UTF-8?q?ne=20directories?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 30b2be1..44305d6 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -97,7 +97,7 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): +def process(gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None): cer = None n_characters = None @@ -106,8 +106,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): n_words = None word_diff_report = "" - for k, (gt_fn, ocr_fn) in enumerate(find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)): + if gt_suffix is not None and ocr_suffix is not None: + gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) + else: + gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir) + for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files): gt_text = plain_extract(gt_fn, include_filename_in_id=True) ocr_text = plain_extract(ocr_fn, include_filename_in_id=True) gt_words = words_normalized(gt_text) @@ -183,17 +187,25 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): @click.option( "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red" ) -def main(gt, ocr, report_prefix, metrics): +@click.option("--gt-suffix", help="Suffix of GT line text files") +@click.option("--ocr-suffix", help="Suffix of OCR line text files") +def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): """ Compare the GT line text directory against the OCR line text directory. This assumes that the GT line text directory contains textfiles with a common suffix like ".gt.txt", and the OCR line text directory contains textfiles with a common suffix like ".some-ocr.txt". The text files also need to be paired, - i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt" - in the OCT lines directory. + i.e. the GT filename "line001.gt.txt" needs to match a filename + "line001.some-ocr.txt" in the OCR lines directory. - The GT and OCR directories are usually round truth line texts and the results of + GT and OCR directories may contain line text files in matching subdirectories, + e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt". + + GT and OCR directories can also be the same directory, but in this case you need + to give --gt-suffix and --ocr-suffix explicitly. + + The GT and OCR directories are usually ground truth line texts and the results of an OCR software, but you may use dinglehopper to compare two OCR results. In that case, use --no-metrics to disable the then meaningless metrics and also change the color scheme from green/red to blue. @@ -204,7 +216,7 @@ def main(gt, ocr, report_prefix, metrics): """ initLogging() - process(gt, ocr, report_prefix, metrics=metrics) + process(gt, ocr, report_prefix, metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix) if __name__ == "__main__":