Allow disabling the html report.

2026-07-29 15:02:33 +02:00 · 2021-06-14 16:25:31 +02:00 · 2021-06-14 16:25:31 +02:00 · 7642a53091
commit 7642a53091
parent e8ccffb275
6 changed files with 60 additions and 14 deletions
--- a/README.md
+++ b/README.md
@ -46,12 +46,15 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
  The metrics can be chosen via a comma separated combination of their acronyms
  like "--metrics=ca,wer,boc,bow".
  The html report can be enabled/disabled using --html/--no-html.
  By default, the text of PAGE files is extracted on 'region' level. You may
  use "--textequiv-level line" to extract from the level of TextLine tags.
 Options:
  --metrics                 Enable different metrics like ca|cer, wa|wer, boc and bow.
  --textequiv-level LEVEL   PAGE TextEquiv level to extract text from
  --html / --no-html        Enable/disable html report.
  --progress                Show progress bar
  --help                    Show this message and exit.
 ~~~
@ -85,10 +88,11 @@ The OCR-D processor has these parameters:
 | ------------------------- | ------------------------------------------------------------------- |
 | `-P metrics cer,wer`      | Enable character error rate and word error rate (default)           |
 | `-P textequiv_level line` | (PAGE) Extract text from TextLine level (default: TextRegion level) |
 | `-P html false`           | Disable the html report (default: enabled)                          |
 For example:
 ~~~
-ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer
+ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer -P html false
 ~~~
 Developer information
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@ -144,7 +144,9 @@ def generate_json_report(gt, ocr, report_prefix, metrics_results):
        json.dump(json_dict, fp)
-def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="region"):
+def process(
    gt, ocr, report_prefix, *, html=True, metrics="cer,wer", textequiv_level="region"
 ):
    """Check OCR result against GT.
    The @click decorators change the signature of the decorated functions,
@ -172,6 +174,7 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio
            metrics_results[result.metric] = result
    generate_json_report(gt, ocr, report_prefix, metrics_results)
    if html:
        generate_html_report(gt, ocr, gt_text, ocr_text, report_prefix, metrics_results)
@ -179,6 +182,7 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio
@click.argument("gt", type=click.Path(exists=True))
@click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report")
@click.option("--html", default=True, is_flag=True, help="Enable/disable html report.")
@click.option(
    "--metrics",
    default="cer,wer",
@ -191,7 +195,7 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio
    metavar="LEVEL",
 )
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
-def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
+def main(gt, ocr, report_prefix, html, metrics, textequiv_level, progress):
    """
    Compare the PAGE/ALTO/text document GT against the document OCR.
@ -210,12 +214,21 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
    The metrics can be chosen via a comma separated combination of their acronyms
    like "--metrics=ca,wer,boc,bow".
    The html report can be enabled/disabled using --html / --no-html.
    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    initLogging()
    Config.progress = progress
-    process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
+    process(
        gt,
        ocr,
        report_prefix,
        html=html,
        metrics=metrics,
        textequiv_level=textequiv_level,
    )
 if __name__ == "__main__":
--- a/qurator/dinglehopper/ocrd-tool.json
+++ b/qurator/dinglehopper/ocrd-tool.json
@ -18,6 +18,11 @@
        "recognition/text-recognition"
      ],
      "parameters": {
        "html": {
          "type": "boolean",
          "default": true,
          "description": "Enable/disable html report."
        },
        "metrics": {
          "type": "string",
          "enum": ["", "boc", "boc,bow", "bow", "ca", "ca,boc", "ca,boc,bow", "ca,bow", "ca,wa", "ca,wa,boc", "ca,wa,boc,bow", "ca,wa,bow", "ca,wer", "ca,wer,boc", "ca,wer,boc,bow", "ca,wer,bow", "cer", "cer,boc", "cer,boc,bow", "cer,bow", "cer,wa", "cer,wa,boc", "cer,wa,boc,bow", "cer,wa,bow", "cer,wer", "cer,wer,boc", "cer,wer,boc,bow", "cer,wer,bow", "wa", "wa,boc", "wa,boc,bow", "wa,bow", "wer", "wer,boc", "wer,boc,bow", "wer,bow"],
@ -28,7 +33,7 @@
          "type": "string",
          "enum": ["region", "line"],
          "default": "region",
-          "description": "PAGE XML hierarchy level to extract the text from"
+          "description": "PAGE XML hierarchy level to extract the text from."
        }
      }
    }
--- a/qurator/dinglehopper/ocrd_cli.py
+++ b/qurator/dinglehopper/ocrd_cli.py
@ -30,6 +30,7 @@ class OcrdDinglehopperEvaluate(Processor):
        log = getLogger("processor.OcrdDinglehopperEvaluate")
        html = self.parameter["html"]
        metrics = self.parameter["metrics"]
        textequiv_level = self.parameter["textequiv_level"]
        gt_grp, ocr_grp = self.input_file_grp.split(",")
@ -57,15 +58,16 @@ class OcrdDinglehopperEvaluate(Processor):
                gt_file.local_filename,
                ocr_file.local_filename,
                report_prefix,
                html=html,
                metrics=metrics,
                textequiv_level=textequiv_level,
            )
            # Add reports to the workspace
-            for report_suffix, mimetype in [
+            report_types = [(".json", "application/json")]
-                [".html", "text/html"],
+            if html:
-                [".json", "application/json"],
+                report_types.append((".html", "text/html"))
-            ]:
+            for report_suffix, mimetype in report_types:
                self.workspace.add_file(
                    ID=file_id + report_suffix,
                    file_grp=self.output_file_grp,
--- a/qurator/dinglehopper/tests/test_integ_cli.py
+++ b/qurator/dinglehopper/tests/test_integ_cli.py
@ -1,3 +1,5 @@
 from pathlib import Path
 import pytest
 from .util import working_directory
@ -16,3 +18,19 @@ def test_cli_unknown_metric(tmp_path):
        with pytest.raises(ValueError, match="Unknown metric 'unknown'."):
            process("gt.txt", "ocr.txt", "report", metrics="cer,unknown")
@pytest.mark.integration
def test_cli_html_report_parameter(tmp_path):
    """Check that ``html=False`` suppresses the html report but not the json one."""
    with working_directory(str(tmp_path)):
        # Two empty inputs are enough; only the produced report files matter.
        for input_name in ("gt.txt", "ocr.txt"):
            with open(input_name, "w") as input_file:
                input_file.write("")

        process("gt.txt", "ocr.txt", "report", html=False)

        report = Path("report")
        json_report = report.with_suffix(".json")
        html_report = report.with_suffix(".html")
        assert json_report.exists()
        assert not html_report.exists()
--- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
+++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
@ -1,5 +1,6 @@
 import json
 from itertools import combinations
 from pathlib import Path
 import pytest
@ -45,11 +46,10 @@ def test_cli_json(metrics, tmp_path):
        with open("ocr.txt", "w") as ocrf:
            ocrf.write("AAAAB")
        with open("gt.txt", "r") as gtf:
            print(gtf.read())
        process("gt.txt", "ocr.txt", "report", metrics=",".join(metrics))
-        with open("report.json", "r") as jsonf:
+
-            print(jsonf.read())
+        for suffix in (".json", ".html"):
            assert Path("report").with_suffix(suffix).exists()
        with open("report.json", "r") as jsonf:
            j = json.load(jsonf)
@ -76,6 +76,10 @@ def test_cli_json_extremes(gt, ocr, err, tmp_path):
            ocrf.write(ocr)
        process("gt.txt", "ocr.txt", "report", metrics="ca,wa,boc,bow")
        for suffix in (".json", ".html"):
            assert Path("report").with_suffix(suffix).exists()
        with open("report.json", "r") as jsonf:
            j = json.load(jsonf)
            for metric in set(METRIC_DICT.values()):