diff --git a/README.md b/README.md index 8a50cc3..8490d25 100644 --- a/README.md +++ b/README.md @@ -46,12 +46,15 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] The metrics can be chosen via a comma separated combination of their acronyms like "--metrics=ca,wer,boc,bow". + The html report can be enabled/disabled using --html/--no-html. + By default, the text of PAGE files is extracted on 'region' level. You may use "--textequiv-level line" to extract from the level of TextLine tags. Options: --metrics Enable different metrics like ca|cer, wa|wer, boc and bow. --textequiv-level LEVEL PAGE TextEquiv level to extract text from + --html / --no-html Enabling/disabling html report. --progress Show progress bar --help Show this message and exit. ~~~ @@ -85,10 +88,11 @@ The OCR-D processor has these parameters: | ------------------------- | ------------------------------------------------------------------- | | `-P metrics cer,wer` | Enable character error rate and word error rate (default) | | `-P textequiv_level line` | (PAGE) Extract text from TextLine level (default: TextRegion level) | +| `-P html false` | Enabling/disabling html report (default: enabled). | For example: ~~~ -ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer +ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer -P html false ~~~ Developer information diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 7b95c99..c360c4e 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -144,7 +144,9 @@ def generate_json_report(gt, ocr, report_prefix, metrics_results): json.dump(json_dict, fp) -def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="region"): +def process( + gt, ocr, report_prefix, *, html=True, metrics="cer,wer", textequiv_level="region" +): """Check OCR result against GT. 
The @click decorators change the signature of the decorated functions, @@ -172,13 +174,15 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio metrics_results[result.metric] = result generate_json_report(gt, ocr, report_prefix, metrics_results) - generate_html_report(gt, ocr, gt_text, ocr_text, report_prefix, metrics_results) + if html: + generate_html_report(gt, ocr, gt_text, ocr_text, report_prefix, metrics_results) @click.command() @click.argument("gt", type=click.Path(exists=True)) @click.argument("ocr", type=click.Path(exists=True)) @click.argument("report_prefix", type=click.Path(), default="report") +@click.option("--html/--no-html", default=True, help="Enable/disable html report.") @click.option( "--metrics", default="cer,wer", @@ -191,7 +195,7 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio metavar="LEVEL", ) @click.option("--progress", default=False, is_flag=True, help="Show progress bar") -def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): +def main(gt, ocr, report_prefix, html, metrics, textequiv_level, progress): """ Compare the PAGE/ALTO/text document GT against the document OCR. @@ -210,12 +214,21 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): The metrics can be chosen via a comma separated combination of their acronyms like "--metrics=ca,wer,boc,bow". + The html report can be enabled/disabled using --html / --no-html. + By default, the text of PAGE files is extracted on 'region' level. You may use "--textequiv-level line" to extract from the level of TextLine tags. 
""" initLogging() Config.progress = progress - process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level) + process( + gt, + ocr, + report_prefix, + html=html, + metrics=metrics, + textequiv_level=textequiv_level, + ) if __name__ == "__main__": diff --git a/qurator/dinglehopper/ocrd-tool.json b/qurator/dinglehopper/ocrd-tool.json index 0537db8..9119d0d 100644 --- a/qurator/dinglehopper/ocrd-tool.json +++ b/qurator/dinglehopper/ocrd-tool.json @@ -18,6 +18,11 @@ "recognition/text-recognition" ], "parameters": { + "html": { + "type": "boolean", + "default": true, + "description": "Enable/disable html report." + }, "metrics": { "type": "string", "enum": ["", "boc", "boc,bow", "bow", "ca", "ca,boc", "ca,boc,bow", "ca,bow", "ca,wa", "ca,wa,boc", "ca,wa,boc,bow", "ca,wa,bow", "ca,wer", "ca,wer,boc", "ca,wer,boc,bow", "ca,wer,bow", "cer", "cer,boc", "cer,boc,bow", "cer,bow", "cer,wa", "cer,wa,boc", "cer,wa,boc,bow", "cer,wa,bow", "cer,wer", "cer,wer,boc", "cer,wer,boc,bow", "cer,wer,bow", "wa", "wa,boc", "wa,boc,bow", "wa,bow", "wer", "wer,boc", "wer,boc,bow", "wer,bow"], @@ -28,7 +33,7 @@ "type": "string", "enum": ["region", "line"], "default": "region", - "description": "PAGE XML hierarchy level to extract the text from" + "description": "PAGE XML hierarchy level to extract the text from." 
} } } diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py index c7aea19..1040fe9 100644 --- a/qurator/dinglehopper/ocrd_cli.py +++ b/qurator/dinglehopper/ocrd_cli.py @@ -30,6 +30,7 @@ class OcrdDinglehopperEvaluate(Processor): log = getLogger("processor.OcrdDinglehopperEvaluate") + html = self.parameter["html"] metrics = self.parameter["metrics"] textequiv_level = self.parameter["textequiv_level"] gt_grp, ocr_grp = self.input_file_grp.split(",") @@ -57,15 +58,16 @@ class OcrdDinglehopperEvaluate(Processor): gt_file.local_filename, ocr_file.local_filename, report_prefix, + html=html, metrics=metrics, textequiv_level=textequiv_level, ) # Add reports to the workspace - for report_suffix, mimetype in [ - [".html", "text/html"], - [".json", "application/json"], - ]: + report_types = [(".json", "application/json")] + if html: + report_types.append((".html", "text/html")) + for report_suffix, mimetype in report_types: self.workspace.add_file( ID=file_id + report_suffix, file_grp=self.output_file_grp, diff --git a/qurator/dinglehopper/tests/test_integ_cli.py b/qurator/dinglehopper/tests/test_integ_cli.py index 1769736..667da73 100644 --- a/qurator/dinglehopper/tests/test_integ_cli.py +++ b/qurator/dinglehopper/tests/test_integ_cli.py @@ -1,3 +1,5 @@ +from pathlib import Path + import pytest from .util import working_directory @@ -16,3 +18,19 @@ def test_cli_unknown_metric(tmp_path): with pytest.raises(ValueError, match="Unknown metric 'unknown'."): process("gt.txt", "ocr.txt", "report", metrics="cer,unknown") + + +@pytest.mark.integration +def test_cli_html_report_parameter(tmp_path): + """Test that html report can get turned off.""" + + with working_directory(str(tmp_path)): + with open("gt.txt", "w") as gtf: + gtf.write("") + with open("ocr.txt", "w") as ocrf: + ocrf.write("") + + process("gt.txt", "ocr.txt", "report", html=False) + + assert Path("report").with_suffix(".json").exists() + assert not Path("report").with_suffix(".html").exists() 
diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py index 4163d82..df14341 100644 --- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py @@ -1,5 +1,6 @@ import json from itertools import combinations +from pathlib import Path import pytest @@ -45,11 +46,10 @@ def test_cli_json(metrics, tmp_path): with open("ocr.txt", "w") as ocrf: ocrf.write("AAAAB") - with open("gt.txt", "r") as gtf: - print(gtf.read()) process("gt.txt", "ocr.txt", "report", metrics=",".join(metrics)) - with open("report.json", "r") as jsonf: - print(jsonf.read()) + + for suffix in (".json", ".html"): + assert Path("report").with_suffix(suffix).exists() with open("report.json", "r") as jsonf: j = json.load(jsonf) @@ -76,6 +76,10 @@ def test_cli_json_extremes(gt, ocr, err, tmp_path): ocrf.write(ocr) process("gt.txt", "ocr.txt", "report", metrics="ca,wa,boc,bow") + + for suffix in (".json", ".html"): + assert Path("report").with_suffix(suffix).exists() + with open("report.json", "r") as jsonf: j = json.load(jsonf) for metric in set(METRIC_DICT.values()):