1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-10-25 23:44:13 +02:00

Allow disabling the html report.

This commit is contained in:
Benjamin Rosemann 2021-06-14 16:25:31 +02:00
parent e8ccffb275
commit 7642a53091
6 changed files with 60 additions and 14 deletions

View file

@ -46,12 +46,15 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
The metrics can be chosen via a comma separated combination of their acronyms The metrics can be chosen via a comma separated combination of their acronyms
like "--metrics=ca,wer,boc,bow". like "--metrics=ca,wer,boc,bow".
The html report can be enabled/disabled using --html/--no-html.
By default, the text of PAGE files is extracted on 'region' level. You may By default, the text of PAGE files is extracted on 'region' level. You may
use "--textequiv-level line" to extract from the level of TextLine tags. use "--textequiv-level line" to extract from the level of TextLine tags.
Options: Options:
--metrics Enable different metrics like ca|cer, wa|wer, boc and bow. --metrics Enable different metrics like ca|cer, wa|wer, boc and bow.
--textequiv-level LEVEL PAGE TextEquiv level to extract text from --textequiv-level LEVEL PAGE TextEquiv level to extract text from
--html / --no-html Enable/disable html report.
--progress Show progress bar --progress Show progress bar
--help Show this message and exit. --help Show this message and exit.
~~~ ~~~
@ -85,10 +88,11 @@ The OCR-D processor has these parameters:
| ------------------------- | ------------------------------------------------------------------- | | ------------------------- | ------------------------------------------------------------------- |
| `-P metrics cer,wer` | Enable character error rate and word error rate (default) | | `-P metrics cer,wer` | Enable character error rate and word error rate (default) |
| `-P textequiv_level line` | (PAGE) Extract text from TextLine level (default: TextRegion level) | | `-P textequiv_level line` | (PAGE) Extract text from TextLine level (default: TextRegion level) |
| `-P html false` | Disable the html report (default: enabled) |
For example: For example:
~~~ ~~~
ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer -P html false
~~~ ~~~
Developer information Developer information

View file

@ -144,7 +144,9 @@ def generate_json_report(gt, ocr, report_prefix, metrics_results):
json.dump(json_dict, fp) json.dump(json_dict, fp)
def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="region"): def process(
gt, ocr, report_prefix, *, html=True, metrics="cer,wer", textequiv_level="region"
):
"""Check OCR result against GT. """Check OCR result against GT.
The @click decorators change the signature of the decorated functions, The @click decorators change the signature of the decorated functions,
@ -172,13 +174,15 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio
metrics_results[result.metric] = result metrics_results[result.metric] = result
generate_json_report(gt, ocr, report_prefix, metrics_results) generate_json_report(gt, ocr, report_prefix, metrics_results)
generate_html_report(gt, ocr, gt_text, ocr_text, report_prefix, metrics_results) if html:
generate_html_report(gt, ocr, gt_text, ocr_text, report_prefix, metrics_results)
@click.command() @click.command()
@click.argument("gt", type=click.Path(exists=True)) @click.argument("gt", type=click.Path(exists=True))
@click.argument("ocr", type=click.Path(exists=True)) @click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report") @click.argument("report_prefix", type=click.Path(), default="report")
@click.option("--html", default=True, is_flag=True, help="Enable/disable html report.")
@click.option( @click.option(
"--metrics", "--metrics",
default="cer,wer", default="cer,wer",
@ -191,7 +195,7 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio
metavar="LEVEL", metavar="LEVEL",
) )
@click.option("--progress", default=False, is_flag=True, help="Show progress bar") @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): def main(gt, ocr, report_prefix, html, metrics, textequiv_level, progress):
""" """
Compare the PAGE/ALTO/text document GT against the document OCR. Compare the PAGE/ALTO/text document GT against the document OCR.
@ -210,12 +214,21 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
The metrics can be chosen via a comma separated combination of their acronyms The metrics can be chosen via a comma separated combination of their acronyms
like "--metrics=ca,wer,boc,bow". like "--metrics=ca,wer,boc,bow".
The html report can be enabled/disabled using --html / --no-html.
By default, the text of PAGE files is extracted on 'region' level. You may By default, the text of PAGE files is extracted on 'region' level. You may
use "--textequiv-level line" to extract from the level of TextLine tags. use "--textequiv-level line" to extract from the level of TextLine tags.
""" """
initLogging() initLogging()
Config.progress = progress Config.progress = progress
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level) process(
gt,
ocr,
report_prefix,
html=html,
metrics=metrics,
textequiv_level=textequiv_level,
)
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -18,6 +18,11 @@
"recognition/text-recognition" "recognition/text-recognition"
], ],
"parameters": { "parameters": {
"html": {
"type": "boolean",
"default": true,
"description": "Enable/disable html report."
},
"metrics": { "metrics": {
"type": "string", "type": "string",
"enum": ["", "boc", "boc,bow", "bow", "ca", "ca,boc", "ca,boc,bow", "ca,bow", "ca,wa", "ca,wa,boc", "ca,wa,boc,bow", "ca,wa,bow", "ca,wer", "ca,wer,boc", "ca,wer,boc,bow", "ca,wer,bow", "cer", "cer,boc", "cer,boc,bow", "cer,bow", "cer,wa", "cer,wa,boc", "cer,wa,boc,bow", "cer,wa,bow", "cer,wer", "cer,wer,boc", "cer,wer,boc,bow", "cer,wer,bow", "wa", "wa,boc", "wa,boc,bow", "wa,bow", "wer", "wer,boc", "wer,boc,bow", "wer,bow"], "enum": ["", "boc", "boc,bow", "bow", "ca", "ca,boc", "ca,boc,bow", "ca,bow", "ca,wa", "ca,wa,boc", "ca,wa,boc,bow", "ca,wa,bow", "ca,wer", "ca,wer,boc", "ca,wer,boc,bow", "ca,wer,bow", "cer", "cer,boc", "cer,boc,bow", "cer,bow", "cer,wa", "cer,wa,boc", "cer,wa,boc,bow", "cer,wa,bow", "cer,wer", "cer,wer,boc", "cer,wer,boc,bow", "cer,wer,bow", "wa", "wa,boc", "wa,boc,bow", "wa,bow", "wer", "wer,boc", "wer,boc,bow", "wer,bow"],
@ -28,7 +33,7 @@
"type": "string", "type": "string",
"enum": ["region", "line"], "enum": ["region", "line"],
"default": "region", "default": "region",
"description": "PAGE XML hierarchy level to extract the text from" "description": "PAGE XML hierarchy level to extract the text from."
} }
} }
} }

View file

@ -30,6 +30,7 @@ class OcrdDinglehopperEvaluate(Processor):
log = getLogger("processor.OcrdDinglehopperEvaluate") log = getLogger("processor.OcrdDinglehopperEvaluate")
html = self.parameter["html"]
metrics = self.parameter["metrics"] metrics = self.parameter["metrics"]
textequiv_level = self.parameter["textequiv_level"] textequiv_level = self.parameter["textequiv_level"]
gt_grp, ocr_grp = self.input_file_grp.split(",") gt_grp, ocr_grp = self.input_file_grp.split(",")
@ -57,15 +58,16 @@ class OcrdDinglehopperEvaluate(Processor):
gt_file.local_filename, gt_file.local_filename,
ocr_file.local_filename, ocr_file.local_filename,
report_prefix, report_prefix,
html=html,
metrics=metrics, metrics=metrics,
textequiv_level=textequiv_level, textequiv_level=textequiv_level,
) )
# Add reports to the workspace # Add reports to the workspace
for report_suffix, mimetype in [ report_types = [(".json", "application/json")]
[".html", "text/html"], if html:
[".json", "application/json"], report_types.append((".html", "text/html"))
]: for report_suffix, mimetype in report_types:
self.workspace.add_file( self.workspace.add_file(
ID=file_id + report_suffix, ID=file_id + report_suffix,
file_grp=self.output_file_grp, file_grp=self.output_file_grp,

View file

@ -1,3 +1,5 @@
from pathlib import Path
import pytest import pytest
from .util import working_directory from .util import working_directory
@ -16,3 +18,19 @@ def test_cli_unknown_metric(tmp_path):
with pytest.raises(ValueError, match="Unknown metric 'unknown'."): with pytest.raises(ValueError, match="Unknown metric 'unknown'."):
process("gt.txt", "ocr.txt", "report", metrics="cer,unknown") process("gt.txt", "ocr.txt", "report", metrics="cer,unknown")
@pytest.mark.integration
def test_cli_html_report_parameter(tmp_path):
    """Verify that process(html=False) writes only the json report."""
    with working_directory(str(tmp_path)):
        # Empty GT and OCR inputs suffice; only the produced files are checked.
        for name in ("gt.txt", "ocr.txt"):
            with open(name, "w") as handle:
                handle.write("")

        process("gt.txt", "ocr.txt", "report", html=False)

        report = Path("report")
        assert report.with_suffix(".json").exists()
        assert not report.with_suffix(".html").exists()

View file

@ -1,5 +1,6 @@
import json import json
from itertools import combinations from itertools import combinations
from pathlib import Path
import pytest import pytest
@ -45,11 +46,10 @@ def test_cli_json(metrics, tmp_path):
with open("ocr.txt", "w") as ocrf: with open("ocr.txt", "w") as ocrf:
ocrf.write("AAAAB") ocrf.write("AAAAB")
with open("gt.txt", "r") as gtf:
print(gtf.read())
process("gt.txt", "ocr.txt", "report", metrics=",".join(metrics)) process("gt.txt", "ocr.txt", "report", metrics=",".join(metrics))
with open("report.json", "r") as jsonf:
print(jsonf.read()) for suffix in (".json", ".html"):
assert Path("report").with_suffix(suffix).exists()
with open("report.json", "r") as jsonf: with open("report.json", "r") as jsonf:
j = json.load(jsonf) j = json.load(jsonf)
@ -76,6 +76,10 @@ def test_cli_json_extremes(gt, ocr, err, tmp_path):
ocrf.write(ocr) ocrf.write(ocr)
process("gt.txt", "ocr.txt", "report", metrics="ca,wa,boc,bow") process("gt.txt", "ocr.txt", "report", metrics="ca,wa,boc,bow")
for suffix in (".json", ".html"):
assert Path("report").with_suffix(suffix).exists()
with open("report.json", "r") as jsonf: with open("report.json", "r") as jsonf:
j = json.load(jsonf) j = json.load(jsonf)
for metric in set(METRIC_DICT.values()): for metric in set(METRIC_DICT.values()):