diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index c8cb2ef..5e5b5a8 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -1,17 +1,18 @@ +import json import os import click from jinja2 import Environment, FileSystemLoader from markupsafe import escape -from uniseg.graphemecluster import grapheme_clusters from ocrd_utils import initLogging +from uniseg.graphemecluster import grapheme_clusters -from .metrics.character_error_rate import character_error_rate_n -from .metrics.word_error_rate import word_error_rate_n, words_normalized from .align import seq_align +from .config import Config from .extracted_text import ExtractedText +from .metrics.character_error_rate import character_error_rate_n +from .metrics.word_error_rate import word_error_rate_n, words_normalized from .ocr_files import extract -from .config import Config def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): @@ -84,19 +85,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): ) -def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): - """Check OCR result against GT. - - The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use - Click on a wrapper. - """ - - gt_text = extract(gt, textequiv_level=textequiv_level) - ocr_text = extract(ocr, textequiv_level=textequiv_level) - - cer, n_characters = character_error_rate_n(gt_text, ocr_text) - wer, n_words = word_error_rate_n(gt_text, ocr_text) - +def generate_html_report( + gt, ocr, gt_text, ocr_text, report_prefix, metrics, cer, n_characters, wer, n_words +): char_diff_report = gen_diff_report( gt_text, ocr_text, css_prefix="c", joiner="", none="·" ) @@ -107,41 +98,71 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯" ) - def json_float(value): - """Convert a float value to an JSON float. - - This is here so that float('inf') yields "Infinity", not "inf". - """ - if value == float("inf"): - return "Infinity" - elif value == float("-inf"): - return "-Infinity" - else: - return str(value) - env = Environment( loader=FileSystemLoader( os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates") ) ) - env.filters["json_float"] = json_float - - for report_suffix in (".html", ".json"): - template_fn = "report" + report_suffix + ".j2" - out_fn = report_prefix + report_suffix - - template = env.get_template(template_fn) - template.stream( - gt=gt, - ocr=ocr, - cer=cer, - n_characters=n_characters, - wer=wer, - n_words=n_words, - char_diff_report=char_diff_report, - word_diff_report=word_diff_report, - metrics=metrics, - ).dump(out_fn) + + report_suffix = ".html" + template_fn = "report" + report_suffix + ".j2" + out_fn = report_prefix + report_suffix + + template = env.get_template(template_fn) + template.stream( + gt=gt, + ocr=ocr, + cer=cer, + n_characters=n_characters, + wer=wer, + n_words=n_words, + char_diff_report=char_diff_report, + word_diff_report=word_diff_report, + metrics=metrics, + ).dump(out_fn) + + +def generate_json_report( + gt, ocr, report_prefix, metrics, cer, n_characters, wer, n_words +): + json_dict = {"gt": gt, "ocr": ocr, "n_characters": n_characters, "n_words": n_words} + if metrics: + json_dict = {**json_dict, "cer": cer, "wer": wer} + with open(f"{report_prefix}.json", 'w') as fp: + json.dump(json_dict, fp) + + +def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): + """Check OCR result against GT. + + The @click decorators change the signature of the decorated functions, + so we keep this undecorated version and use Click on a wrapper. + """ + + gt_text = extract(gt, textequiv_level=textequiv_level) + ocr_text = extract(ocr, textequiv_level=textequiv_level) + + cer, n_characters = character_error_rate_n(gt_text, ocr_text) + wer, n_words = word_error_rate_n(gt_text, ocr_text) + + generate_json_report( + gt, ocr, report_prefix, metrics, cer, n_characters, wer, n_words + ) + + html_report = True + if html_report: + generate_html_report( + gt, + ocr, + gt_text, + ocr_text, + report_prefix, + metrics, + cer, + n_characters, + wer, + n_words, + ) @click.command() diff --git a/qurator/dinglehopper/templates/report.json.j2 b/qurator/dinglehopper/templates/report.json.j2 deleted file mode 100644 index 0e8af03..0000000 --- a/qurator/dinglehopper/templates/report.json.j2 +++ /dev/null @@ -1,10 +0,0 @@ -{ - "gt": "{{ gt }}", - "ocr": "{{ ocr }}", -{% if metrics %} - "cer": {{ cer|json_float }}, - "wer": {{ wer|json_float }}, -{% endif %} - "n_characters": {{ n_characters }}, - "n_words": {{ n_words }} -}