🎨 Reformat using Black

pull/90/head
Mike Gerber 1 year ago
parent d50d624554
commit bea56117ae

@ -70,7 +70,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
# support this, i.e. display for the one id produced # support this, i.e. display for the one id produced
if differences: if differences:
found_differences.append(f'{g} :: {o}') found_differences.append(f"{g} :: {o}")
gtx += joiner + format_thing(g, css_classes, gt_id) gtx += joiner + format_thing(g, css_classes, gt_id)
ocrx += joiner + format_thing(o, css_classes, ocr_id) ocrx += joiner + format_thing(o, css_classes, ocr_id)
@ -82,14 +82,17 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
found_differences = dict(Counter(elem for elem in found_differences)) found_differences = dict(Counter(elem for elem in found_differences))
return """ return (
"""
<div class="row"> <div class="row">
<div class="col-md-6 gt">{}</div> <div class="col-md-6 gt">{}</div>
<div class="col-md-6 ocr">{}</div> <div class="col-md-6 ocr">{}</div>
</div> </div>
""".format( """.format(
gtx, ocrx gtx, ocrx
), found_differences ),
found_differences,
)
def json_float(value): def json_float(value):
@ -105,8 +108,16 @@ def json_float(value):
return str(value) return str(value)
def process(gt, ocr, report_prefix, reports_folder='.', *, metrics=True, def process(
differences=False, textequiv_level="region"): gt,
ocr,
report_prefix,
reports_folder=".",
*,
metrics=True,
differences=False,
textequiv_level="region",
):
"""Check OCR result against GT. """Check OCR result against GT.
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
@ -119,15 +130,19 @@ def process(gt, ocr, report_prefix, reports_folder='.', *, metrics=True,
cer, n_characters = character_error_rate_n(gt_text, ocr_text) cer, n_characters = character_error_rate_n(gt_text, ocr_text)
wer, n_words = word_error_rate_n(gt_text, ocr_text) wer, n_words = word_error_rate_n(gt_text, ocr_text)
char_diff_report, diff_c = gen_diff_report(gt_text, ocr_text, css_prefix="c", char_diff_report, diff_c = gen_diff_report(
joiner="", gt_text, ocr_text, css_prefix="c", joiner="", none="·", differences=differences
none="·", differences=differences) )
gt_words = words_normalized(gt_text) gt_words = words_normalized(gt_text)
ocr_words = words_normalized(ocr_text) ocr_words = words_normalized(ocr_text)
word_diff_report, diff_w = gen_diff_report( word_diff_report, diff_w = gen_diff_report(
gt_words, ocr_words, css_prefix="w", joiner=" ", none="", gt_words,
differences=differences ocr_words,
css_prefix="w",
joiner=" ",
none="",
differences=differences,
) )
env = Environment( env = Environment(
@ -162,19 +177,23 @@ def process(gt, ocr, report_prefix, reports_folder='.', *, metrics=True,
).dump(out_fn) ).dump(out_fn)
def process_dir(gt, ocr, report_prefix, reports_folder, metrics, differences, def process_dir(
textequiv_level): gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level
):
for gt_file in os.listdir(gt): for gt_file in os.listdir(gt):
gt_file_path = os.path.join(gt, gt_file) gt_file_path = os.path.join(gt, gt_file)
ocr_file_path = os.path.join(ocr, gt_file) ocr_file_path = os.path.join(ocr, gt_file)
if os.path.isfile(gt_file_path) and os.path.isfile(ocr_file_path): if os.path.isfile(gt_file_path) and os.path.isfile(ocr_file_path):
process(gt_file_path, ocr_file_path, process(
f"{gt_file}-{report_prefix}", gt_file_path,
reports_folder=reports_folder, ocr_file_path,
metrics=metrics, f"{gt_file}-{report_prefix}",
differences=differences, reports_folder=reports_folder,
textequiv_level=textequiv_level) metrics=metrics,
differences=differences,
textequiv_level=textequiv_level,
)
else: else:
print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path)) print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
@ -190,7 +209,7 @@ def process_dir(gt, ocr, report_prefix, reports_folder, metrics, differences,
@click.option( @click.option(
"--differences", "--differences",
default=False, default=False,
help="Enable reporting character and word level differences" help="Enable reporting character and word level differences",
) )
@click.option( @click.option(
"--textequiv-level", "--textequiv-level",
@ -199,8 +218,16 @@ def process_dir(gt, ocr, report_prefix, reports_folder, metrics, differences,
metavar="LEVEL", metavar="LEVEL",
) )
@click.option("--progress", default=False, is_flag=True, help="Show progress bar") @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
def main(gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level, def main(
progress): gt,
ocr,
report_prefix,
reports_folder,
metrics,
differences,
textequiv_level,
progress,
):
""" """
Compare the PAGE/ALTO/text document GT against the document OCR. Compare the PAGE/ALTO/text document GT against the document OCR.
@ -228,11 +255,25 @@ def main(gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv
"OCR must be a directory if GT is a directory", param_hint="ocr" "OCR must be a directory if GT is a directory", param_hint="ocr"
) )
else: else:
process_dir(gt, ocr, report_prefix, reports_folder, metrics, process_dir(
differences, textequiv_level) gt,
ocr,
report_prefix,
reports_folder,
metrics,
differences,
textequiv_level,
)
else: else:
process(gt, ocr, report_prefix, reports_folder, metrics=metrics, process(
differences=differences, textequiv_level=textequiv_level) gt,
ocr,
report_prefix,
reports_folder,
metrics=metrics,
differences=differences,
textequiv_level=textequiv_level,
)
if __name__ == "__main__": if __name__ == "__main__":

@ -1,4 +1,3 @@
import click import click
from ocrd_utils import initLogging from ocrd_utils import initLogging

@ -23,7 +23,8 @@ def process(reports_folder, occurrences_threshold=1):
if "cer" not in report_data or "wer" not in report_data: if "cer" not in report_data or "wer" not in report_data:
click.echo( click.echo(
f"Skipping {report} because it does not contain CER and WER") f"Skipping {report} because it does not contain CER and WER"
)
continue continue
cer = report_data["cer"] cer = report_data["cer"]
@ -60,7 +61,7 @@ def process(reports_folder, occurrences_threshold=1):
for report_suffix in (".html", ".json"): for report_suffix in (".html", ".json"):
template_fn = "summary" + report_suffix + ".j2" template_fn = "summary" + report_suffix + ".j2"
out_fn = os.path.join(reports_folder, 'summary' + report_suffix) out_fn = os.path.join(reports_folder, "summary" + report_suffix)
template = env.get_template(template_fn) template = env.get_template(template_fn)
template.stream( template.stream(
num_reports=len(cer_list), num_reports=len(cer_list),
@ -73,14 +74,13 @@ def process(reports_folder, occurrences_threshold=1):
@click.command() @click.command()
@click.argument("reports_folder", @click.argument("reports_folder", type=click.Path(exists=True), default="./reports")
type=click.Path(exists=True), @click.option(
default="./reports" "--occurrences-threshold",
) type=int,
@click.option("--occurrences-threshold", default=1,
type=int, help="Only show differences that occur at least this many times.",
default=1, )
help="Only show differences that occur at least this many times.")
def main(reports_folder, occurrences_threshold): def main(reports_folder, occurrences_threshold):
""" """
Summarize the results from multiple reports generated earlier by dinglehopper. Summarize the results from multiple reports generated earlier by dinglehopper.

@ -16,10 +16,15 @@ def test_cli_directory(tmp_path):
""" """
initLogging() initLogging()
process_dir(os.path.join(data_dir, "directory-test", "gt"), process_dir(
os.path.join(data_dir, "directory-test", "ocr"), os.path.join(data_dir, "directory-test", "gt"),
"report", str(tmp_path / "reports"), False, True, os.path.join(data_dir, "directory-test", "ocr"),
"line") "report",
str(tmp_path / "reports"),
False,
True,
"line",
)
assert os.path.exists(tmp_path / "reports/1.xml-report.json") assert os.path.exists(tmp_path / "reports/1.xml-report.json")
assert os.path.exists(tmp_path / "reports/1.xml-report.html") assert os.path.exists(tmp_path / "reports/1.xml-report.html")
@ -35,9 +40,14 @@ def test_cli_fail_without_gt(tmp_path):
""" """
initLogging() initLogging()
process_dir(os.path.join(data_dir, "directory-test", "gt"), process_dir(
os.path.join(data_dir, "directory-test", "ocr"), os.path.join(data_dir, "directory-test", "gt"),
"report", str(tmp_path / "reports"), False, True, os.path.join(data_dir, "directory-test", "ocr"),
"line") "report",
str(tmp_path / "reports"),
False,
True,
"line",
)
assert len(os.listdir(tmp_path / "reports")) == 2 * 2 assert len(os.listdir(tmp_path / "reports")) == 2 * 2

@ -12,18 +12,26 @@ data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration @pytest.mark.integration
def test_cli_differences(tmp_path): def test_cli_differences(tmp_path):
"""Test that the cli/process() yields a JSON report that includes """Test that the cli/process() yields a JSON report that includes
the differences found between the GT and OCR text""" the differences found between the GT and OCR text"""
initLogging() initLogging()
process(os.path.join(data_dir, "test-gt.page2018.xml"), process(
os.path.join(data_dir, "test-fake-ocr.page2018.xml"), os.path.join(data_dir, "test-gt.page2018.xml"),
"report", tmp_path, differences=True) os.path.join(data_dir, "test-fake-ocr.page2018.xml"),
"report",
tmp_path,
differences=True,
)
assert os.path.exists(tmp_path / "report.json") assert os.path.exists(tmp_path / "report.json")
with open(tmp_path / "report.json", "r") as jsonf: with open(tmp_path / "report.json", "r") as jsonf:
j = json.load(jsonf) j = json.load(jsonf)
assert j["differences"] == {"character_level": {'n :: m': 1, 'ſ :: f': 1}, assert j["differences"] == {
"word_level": {'Augenblick :: Augemblick': 1, "character_level": {"n :: m": 1, "ſ :: f": 1},
'Verſprochene :: Verfprochene': 1}} "word_level": {
"Augenblick :: Augemblick": 1,
"Verſprochene :: Verfprochene": 1,
},
}

@ -18,16 +18,22 @@ def create_summaries(tmp_path):
reports_dirname = tmp_path / "reports" reports_dirname = tmp_path / "reports"
reports_dirname.mkdir() reports_dirname.mkdir()
report1 = {"cer": 0.05, "wer": 0.15, report1 = {
"differences": { "cer": 0.05,
"character_level": {"a": 10, "b": 20}, "wer": 0.15,
"word_level": {"c": 30, "d": 40} "differences": {
}} "character_level": {"a": 10, "b": 20},
report2 = {"cer": 0.10, "wer": 0.20, "word_level": {"c": 30, "d": 40},
"differences": { },
"character_level": {"a": 20, "b": 30}, }
"word_level": {"c": 40, "d": 50} report2 = {
}} "cer": 0.10,
"wer": 0.20,
"differences": {
"character_level": {"a": 20, "b": 30},
"word_level": {"c": 40, "d": 50},
},
}
with open(os.path.join(reports_dirname, "report1.json"), "w") as f: with open(os.path.join(reports_dirname, "report1.json"), "w") as f:
json.dump(report1, f) json.dump(report1, f)
@ -47,7 +53,6 @@ def test_cli_summarize_json(tmp_path, create_summaries):
with open(os.path.join(reports_dirname, "summary.json"), "r") as f: with open(os.path.join(reports_dirname, "summary.json"), "r") as f:
summary_data = json.load(f) summary_data = json.load(f)
assert summary_data["num_reports"] == 2 assert summary_data["num_reports"] == 2
assert summary_data["cer_avg"] == expected_cer_avg assert summary_data["cer_avg"] == expected_cer_avg
assert summary_data["wer_avg"] == expected_wer_avg assert summary_data["wer_avg"] == expected_wer_avg
@ -83,11 +88,13 @@ def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries):
reports_dirname = create_summaries reports_dirname = create_summaries
# This third report has no WER value and should not be included in the summary # This third report has no WER value and should not be included in the summary
report3 = {"cer": 0.10, report3 = {
"differences": { "cer": 0.10,
"character_level": {"a": 20, "b": 30}, "differences": {
"word_level": {"c": 40, "d": 50} "character_level": {"a": 20, "b": 30},
}} "word_level": {"c": 40, "d": 50},
},
}
with open(os.path.join(reports_dirname, "report3-missing-wer.json"), "w") as f: with open(os.path.join(reports_dirname, "report3-missing-wer.json"), "w") as f:
json.dump(report3, f) json.dump(report3, f)

Loading…
Cancel
Save