mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 11:50:00 +02:00
Allow disabling the html report.
This commit is contained in:
parent
e8ccffb275
commit
7642a53091
6 changed files with 60 additions and 14 deletions
|
@ -46,12 +46,15 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
|
|||
The metrics can be chosen via a comma separated combination of their acronyms
|
||||
like "--metrics=ca,wer,boc,bow".
|
||||
|
||||
The html report can be enabled/disabled using --html/--no-html.
|
||||
|
||||
By default, the text of PAGE files is extracted on 'region' level. You may
|
||||
use "--textequiv-level line" to extract from the level of TextLine tags.
|
||||
|
||||
Options:
|
||||
--metrics Enable different metrics like ca|cer, wa|wer, boc and bow.
|
||||
--textequiv-level LEVEL PAGE TextEquiv level to extract text from
|
||||
--html / --no-html Enable/disable html report.
|
||||
--progress Show progress bar
|
||||
--help Show this message and exit.
|
||||
~~~
|
||||
|
@ -85,10 +88,11 @@ The OCR-D processor has these parameters:
|
|||
| ------------------------- | ------------------------------------------------------------------- |
|
||||
| `-P metrics cer,wer` | Enable character error rate and word error rate (default) |
|
||||
| `-P textequiv_level line` | (PAGE) Extract text from TextLine level (default: TextRegion level) |
|
||||
| `-P html false` | Disable the html report (default: enabled) |
|
||||
|
||||
For example:
|
||||
~~~
|
||||
ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer
|
||||
ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer -P html false
|
||||
~~~
|
||||
|
||||
Developer information
|
||||
|
|
|
@ -144,7 +144,9 @@ def generate_json_report(gt, ocr, report_prefix, metrics_results):
|
|||
json.dump(json_dict, fp)
|
||||
|
||||
|
||||
def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="region"):
|
||||
def process(
|
||||
gt, ocr, report_prefix, *, html=True, metrics="cer,wer", textequiv_level="region"
|
||||
):
|
||||
"""Check OCR result against GT.
|
||||
|
||||
The @click decorators change the signature of the decorated functions,
|
||||
|
@ -172,13 +174,15 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio
|
|||
metrics_results[result.metric] = result
|
||||
|
||||
generate_json_report(gt, ocr, report_prefix, metrics_results)
|
||||
generate_html_report(gt, ocr, gt_text, ocr_text, report_prefix, metrics_results)
|
||||
if html:
|
||||
generate_html_report(gt, ocr, gt_text, ocr_text, report_prefix, metrics_results)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument("gt", type=click.Path(exists=True))
|
||||
@click.argument("ocr", type=click.Path(exists=True))
|
||||
@click.argument("report_prefix", type=click.Path(), default="report")
|
||||
@click.option("--html", default=True, is_flag=True, help="Enable/disable html report.")
|
||||
@click.option(
|
||||
"--metrics",
|
||||
default="cer,wer",
|
||||
|
@ -191,7 +195,7 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio
|
|||
metavar="LEVEL",
|
||||
)
|
||||
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
|
||||
def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
|
||||
def main(gt, ocr, report_prefix, html, metrics, textequiv_level, progress):
|
||||
"""
|
||||
Compare the PAGE/ALTO/text document GT against the document OCR.
|
||||
|
||||
|
@ -210,12 +214,21 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
|
|||
The metrics can be chosen via a comma separated combination of their acronyms
|
||||
like "--metrics=ca,wer,boc,bow".
|
||||
|
||||
The html report can be enabled/disabled using --html / --no-html.
|
||||
|
||||
By default, the text of PAGE files is extracted on 'region' level. You may
|
||||
use "--textequiv-level line" to extract from the level of TextLine tags.
|
||||
"""
|
||||
initLogging()
|
||||
Config.progress = progress
|
||||
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
|
||||
process(
|
||||
gt,
|
||||
ocr,
|
||||
report_prefix,
|
||||
html=html,
|
||||
metrics=metrics,
|
||||
textequiv_level=textequiv_level,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -18,6 +18,11 @@
|
|||
"recognition/text-recognition"
|
||||
],
|
||||
"parameters": {
|
||||
"html": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable/disable html report."
|
||||
},
|
||||
"metrics": {
|
||||
"type": "string",
|
||||
"enum": ["", "boc", "boc,bow", "bow", "ca", "ca,boc", "ca,boc,bow", "ca,bow", "ca,wa", "ca,wa,boc", "ca,wa,boc,bow", "ca,wa,bow", "ca,wer", "ca,wer,boc", "ca,wer,boc,bow", "ca,wer,bow", "cer", "cer,boc", "cer,boc,bow", "cer,bow", "cer,wa", "cer,wa,boc", "cer,wa,boc,bow", "cer,wa,bow", "cer,wer", "cer,wer,boc", "cer,wer,boc,bow", "cer,wer,bow", "wa", "wa,boc", "wa,boc,bow", "wa,bow", "wer", "wer,boc", "wer,boc,bow", "wer,bow"],
|
||||
|
@ -28,7 +33,7 @@
|
|||
"type": "string",
|
||||
"enum": ["region", "line"],
|
||||
"default": "region",
|
||||
"description": "PAGE XML hierarchy level to extract the text from"
|
||||
"description": "PAGE XML hierarchy level to extract the text from."
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,6 +30,7 @@ class OcrdDinglehopperEvaluate(Processor):
|
|||
|
||||
log = getLogger("processor.OcrdDinglehopperEvaluate")
|
||||
|
||||
html = self.parameter["html"]
|
||||
metrics = self.parameter["metrics"]
|
||||
textequiv_level = self.parameter["textequiv_level"]
|
||||
gt_grp, ocr_grp = self.input_file_grp.split(",")
|
||||
|
@ -57,15 +58,16 @@ class OcrdDinglehopperEvaluate(Processor):
|
|||
gt_file.local_filename,
|
||||
ocr_file.local_filename,
|
||||
report_prefix,
|
||||
html=html,
|
||||
metrics=metrics,
|
||||
textequiv_level=textequiv_level,
|
||||
)
|
||||
|
||||
# Add reports to the workspace
|
||||
for report_suffix, mimetype in [
|
||||
[".html", "text/html"],
|
||||
[".json", "application/json"],
|
||||
]:
|
||||
report_types = [(".json", "application/json")]
|
||||
if html:
|
||||
report_types.append((".html", "text/html"))
|
||||
for report_suffix, mimetype in report_types:
|
||||
self.workspace.add_file(
|
||||
ID=file_id + report_suffix,
|
||||
file_grp=self.output_file_grp,
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from .util import working_directory
|
||||
|
@ -16,3 +18,19 @@ def test_cli_unknown_metric(tmp_path):
|
|||
|
||||
with pytest.raises(ValueError, match="Unknown metric 'unknown'."):
|
||||
process("gt.txt", "ocr.txt", "report", metrics="cer,unknown")
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_cli_html_report_parameter(tmp_path):
|
||||
"""Test that html report can get turned off."""
|
||||
|
||||
with working_directory(str(tmp_path)):
|
||||
with open("gt.txt", "w") as gtf:
|
||||
gtf.write("")
|
||||
with open("ocr.txt", "w") as ocrf:
|
||||
ocrf.write("")
|
||||
|
||||
process("gt.txt", "ocr.txt", "report", html=False)
|
||||
|
||||
assert Path("report").with_suffix(".json").exists()
|
||||
assert not Path("report").with_suffix(".html").exists()
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import json
|
||||
from itertools import combinations
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
@ -45,11 +46,10 @@ def test_cli_json(metrics, tmp_path):
|
|||
with open("ocr.txt", "w") as ocrf:
|
||||
ocrf.write("AAAAB")
|
||||
|
||||
with open("gt.txt", "r") as gtf:
|
||||
print(gtf.read())
|
||||
process("gt.txt", "ocr.txt", "report", metrics=",".join(metrics))
|
||||
with open("report.json", "r") as jsonf:
|
||||
print(jsonf.read())
|
||||
|
||||
for suffix in (".json", ".html"):
|
||||
assert Path("report").with_suffix(suffix).exists()
|
||||
|
||||
with open("report.json", "r") as jsonf:
|
||||
j = json.load(jsonf)
|
||||
|
@ -76,6 +76,10 @@ def test_cli_json_extremes(gt, ocr, err, tmp_path):
|
|||
ocrf.write(ocr)
|
||||
|
||||
process("gt.txt", "ocr.txt", "report", metrics="ca,wa,boc,bow")
|
||||
|
||||
for suffix in (".json", ".html"):
|
||||
assert Path("report").with_suffix(suffix).exists()
|
||||
|
||||
with open("report.json", "r") as jsonf:
|
||||
j = json.load(jsonf)
|
||||
for metric in set(METRIC_DICT.values()):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue