diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py index 7a8f484..82ee3ce 100644 --- a/src/dinglehopper/cli.py +++ b/src/dinglehopper/cli.py @@ -70,7 +70,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False): # support this, i.e. display for the one id produced if differences: - found_differences.append(f'{g} :: {o}') + found_differences.append(f"{g} :: {o}") gtx += joiner + format_thing(g, css_classes, gt_id) ocrx += joiner + format_thing(o, css_classes, ocr_id) @@ -82,14 +82,17 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False): found_differences = dict(Counter(elem for elem in found_differences)) - return """ + return ( + """
{}
{}
""".format( - gtx, ocrx - ), found_differences + gtx, ocrx + ), + found_differences, + ) def json_float(value): @@ -105,8 +108,16 @@ def json_float(value): return str(value) -def process(gt, ocr, report_prefix, reports_folder='.', *, metrics=True, - differences=False, textequiv_level="region"): +def process( + gt, + ocr, + report_prefix, + reports_folder=".", + *, + metrics=True, + differences=False, + textequiv_level="region", +): """Check OCR result against GT. The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use @@ -119,15 +130,19 @@ def process(gt, ocr, report_prefix, reports_folder='.', *, metrics=True, cer, n_characters = character_error_rate_n(gt_text, ocr_text) wer, n_words = word_error_rate_n(gt_text, ocr_text) - char_diff_report, diff_c = gen_diff_report(gt_text, ocr_text, css_prefix="c", - joiner="", - none="·", differences=differences) + char_diff_report, diff_c = gen_diff_report( + gt_text, ocr_text, css_prefix="c", joiner="", none="·", differences=differences + ) gt_words = words_normalized(gt_text) ocr_words = words_normalized(ocr_text) word_diff_report, diff_w = gen_diff_report( - gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯", - differences=differences + gt_words, + ocr_words, + css_prefix="w", + joiner=" ", + none="⋯", + differences=differences, ) env = Environment( @@ -162,19 +177,23 @@ def process(gt, ocr, report_prefix, reports_folder='.', *, metrics=True, ).dump(out_fn) -def process_dir(gt, ocr, report_prefix, reports_folder, metrics, differences, - textequiv_level): +def process_dir( + gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level +): for gt_file in os.listdir(gt): gt_file_path = os.path.join(gt, gt_file) ocr_file_path = os.path.join(ocr, gt_file) if os.path.isfile(gt_file_path) and os.path.isfile(ocr_file_path): - process(gt_file_path, ocr_file_path, - f"{gt_file}-{report_prefix}", - reports_folder=reports_folder, - metrics=metrics, - differences=differences, - textequiv_level=textequiv_level) + process( + gt_file_path, + ocr_file_path, + f"{gt_file}-{report_prefix}", + reports_folder=reports_folder, + metrics=metrics, + differences=differences, + textequiv_level=textequiv_level, + ) else: print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path)) @@ -190,7 +209,7 @@ def process_dir(gt, ocr, report_prefix, reports_folder, metrics, differences, @click.option( "--differences", default=False, - help="Enable reporting character and word level differences" + help="Enable reporting character and word level differences", ) @click.option( "--textequiv-level", @@ -199,8 +218,16 @@ def process_dir(gt, ocr, report_prefix, reports_folder, metrics, differences, metavar="LEVEL", ) @click.option("--progress", default=False, is_flag=True, help="Show progress bar") -def main(gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level, - progress): +def main( + gt, + ocr, + report_prefix, + reports_folder, + metrics, + differences, + textequiv_level, + progress, +): """ Compare the PAGE/ALTO/text document GT against the document OCR. @@ -228,11 +255,25 @@ def main(gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv "OCR must be a directory if GT is a directory", param_hint="ocr" ) else: - process_dir(gt, ocr, report_prefix, reports_folder, metrics, - differences, textequiv_level) + process_dir( + gt, + ocr, + report_prefix, + reports_folder, + metrics, + differences, + textequiv_level, + ) else: - process(gt, ocr, report_prefix, reports_folder, metrics=metrics, - differences=differences, textequiv_level=textequiv_level) + process( + gt, + ocr, + report_prefix, + reports_folder, + metrics=metrics, + differences=differences, + textequiv_level=textequiv_level, + ) if __name__ == "__main__": diff --git a/src/dinglehopper/cli_extract.py b/src/dinglehopper/cli_extract.py index 3c35b1b..9c51d34 100644 --- a/src/dinglehopper/cli_extract.py +++ b/src/dinglehopper/cli_extract.py @@ -1,4 +1,3 @@ - import click from ocrd_utils import initLogging diff --git a/src/dinglehopper/cli_summarize.py b/src/dinglehopper/cli_summarize.py index 3262371..0422759 100644 --- a/src/dinglehopper/cli_summarize.py +++ b/src/dinglehopper/cli_summarize.py @@ -23,7 +23,8 @@ def process(reports_folder, occurrences_threshold=1): if "cer" not in report_data or "wer" not in report_data: click.echo( - f"Skipping {report} because it does not contain CER and WER") + f"Skipping {report} because it does not contain CER and WER" + ) continue cer = report_data["cer"] @@ -60,7 +61,7 @@ def process(reports_folder, occurrences_threshold=1): for report_suffix in (".html", ".json"): template_fn = "summary" + report_suffix + ".j2" - out_fn = os.path.join(reports_folder, 'summary' + report_suffix) + out_fn = os.path.join(reports_folder, "summary" + report_suffix) template = env.get_template(template_fn) template.stream( num_reports=len(cer_list), @@ -73,14 +74,13 @@ def process(reports_folder, occurrences_threshold=1): @click.command() -@click.argument("reports_folder", - type=click.Path(exists=True), - default="./reports" - ) -@click.option("--occurrences-threshold", - type=int, - default=1, - help="Only show differences that occur at least this many times.") +@click.argument("reports_folder", type=click.Path(exists=True), default="./reports") +@click.option( + "--occurrences-threshold", + type=int, + default=1, + help="Only show differences that occur at least this many times.", +) def main(reports_folder, occurrences_threshold): """ Summarize the results from multiple reports generated earlier by dinglehopper. diff --git a/src/dinglehopper/tests/test_integ_cli_dir.py b/src/dinglehopper/tests/test_integ_cli_dir.py index 7f3196c..c065130 100644 --- a/src/dinglehopper/tests/test_integ_cli_dir.py +++ b/src/dinglehopper/tests/test_integ_cli_dir.py @@ -16,10 +16,15 @@ def test_cli_directory(tmp_path): """ initLogging() - process_dir(os.path.join(data_dir, "directory-test", "gt"), - os.path.join(data_dir, "directory-test", "ocr"), - "report", str(tmp_path / "reports"), False, True, - "line") + process_dir( + os.path.join(data_dir, "directory-test", "gt"), + os.path.join(data_dir, "directory-test", "ocr"), + "report", + str(tmp_path / "reports"), + False, + True, + "line", + ) assert os.path.exists(tmp_path / "reports/1.xml-report.json") assert os.path.exists(tmp_path / "reports/1.xml-report.html") @@ -35,9 +40,14 @@ def test_cli_fail_without_gt(tmp_path): """ initLogging() - process_dir(os.path.join(data_dir, "directory-test", "gt"), - os.path.join(data_dir, "directory-test", "ocr"), - "report", str(tmp_path / "reports"), False, True, - "line") + process_dir( + os.path.join(data_dir, "directory-test", "gt"), + os.path.join(data_dir, "directory-test", "ocr"), + "report", + str(tmp_path / "reports"), + False, + True, + "line", + ) assert len(os.listdir(tmp_path / "reports")) == 2 * 2 diff --git a/src/dinglehopper/tests/test_integ_differences.py b/src/dinglehopper/tests/test_integ_differences.py index 19cb9d1..452e085 100644 --- a/src/dinglehopper/tests/test_integ_differences.py +++ b/src/dinglehopper/tests/test_integ_differences.py @@ -12,18 +12,26 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") @pytest.mark.integration def test_cli_differences(tmp_path): """Test that the cli/process() yields a JSON report that includes - the differences found between the GT and OCR text""" + the differences found between the GT and OCR text""" initLogging() - process(os.path.join(data_dir, "test-gt.page2018.xml"), - os.path.join(data_dir, "test-fake-ocr.page2018.xml"), - "report", tmp_path, differences=True) + process( + os.path.join(data_dir, "test-gt.page2018.xml"), + os.path.join(data_dir, "test-fake-ocr.page2018.xml"), + "report", + tmp_path, + differences=True, + ) assert os.path.exists(tmp_path / "report.json") with open(tmp_path / "report.json", "r") as jsonf: j = json.load(jsonf) - assert j["differences"] == {"character_level": {'n :: m': 1, 'ſ :: f': 1}, - "word_level": {'Augenblick :: Augemblick': 1, - 'Verſprochene :: Verfprochene': 1}} + assert j["differences"] == { + "character_level": {"n :: m": 1, "ſ :: f": 1}, + "word_level": { + "Augenblick :: Augemblick": 1, + "Verſprochene :: Verfprochene": 1, + }, + } diff --git a/src/dinglehopper/tests/test_integ_summarize.py b/src/dinglehopper/tests/test_integ_summarize.py index d4a4900..7ea8f70 100644 --- a/src/dinglehopper/tests/test_integ_summarize.py +++ b/src/dinglehopper/tests/test_integ_summarize.py @@ -18,16 +18,22 @@ def create_summaries(tmp_path): reports_dirname = tmp_path / "reports" reports_dirname.mkdir() - report1 = {"cer": 0.05, "wer": 0.15, - "differences": { - "character_level": {"a": 10, "b": 20}, - "word_level": {"c": 30, "d": 40} - }} - report2 = {"cer": 0.10, "wer": 0.20, - "differences": { - "character_level": {"a": 20, "b": 30}, - "word_level": {"c": 40, "d": 50} - }} + report1 = { + "cer": 0.05, + "wer": 0.15, + "differences": { + "character_level": {"a": 10, "b": 20}, + "word_level": {"c": 30, "d": 40}, + }, + } + report2 = { + "cer": 0.10, + "wer": 0.20, + "differences": { + "character_level": {"a": 20, "b": 30}, + "word_level": {"c": 40, "d": 50}, + }, + } with open(os.path.join(reports_dirname, "report1.json"), "w") as f: json.dump(report1, f) @@ -47,7 +53,6 @@ def test_cli_summarize_json(tmp_path, create_summaries): with open(os.path.join(reports_dirname, "summary.json"), "r") as f: summary_data = json.load(f) - assert summary_data["num_reports"] == 2 assert summary_data["cer_avg"] == expected_cer_avg assert summary_data["wer_avg"] == expected_wer_avg @@ -83,11 +88,13 @@ def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries): reports_dirname = create_summaries # This third report has no WER value and should not be included in the summary - report3 = {"cer": 0.10, - "differences": { - "character_level": {"a": 20, "b": 30}, - "word_level": {"c": 40, "d": 50} - }} + report3 = { + "cer": 0.10, + "differences": { + "character_level": {"a": 20, "b": 30}, + "word_level": {"c": 40, "d": 50}, + }, + } with open(os.path.join(reports_dirname, "report3-missing-wer.json"), "w") as f: json.dump(report3, f)