@ -46,12 +46,15 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
The metrics can be chosen via a comma separated combination of their acronyms
like "--metrics=ca,wer,boc,bow".
The html report can be enabled/disabled using --html/--no-html.
By default, the text of PAGE files is extracted on 'region' level. You may
use "--textequiv-level line" to extract from the level of TextLine tags.
Options:
--metrics Enable different metrics like ca|cer, wa|wer, boc and bow.
--textequiv-level LEVEL PAGE TextEquiv level to extract text from
--html / --no-html Enable/disable html report.
--progress Show progress bar
--help Show this message and exit.
~~~
@ -85,10 +88,11 @@ The OCR-D processor has these parameters:
| ------------------------- | ------------------------------------------------------------------- |
| `-P metrics cer,wer` | Enable character error rate and word error rate (default) |
| `-P textequiv_level line` | (PAGE) Extract text from TextLine level (default: TextRegion level) |
| `-P html false` | Disable the html report (default: enabled) |
For example:
ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer
ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer -P html false
Developer information
@ -144,7 +144,9 @@ def generate_json_report(gt, ocr, report_prefix, metrics_results):
json.dump(json_dict, fp)
def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="region"):
def process(
gt, ocr, report_prefix, *, html=True, metrics="cer,wer", textequiv_level="region"
):
"""Check OCR result against GT.
The @click decorators change the signature of the decorated functions,
@ -172,6 +174,7 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio
metrics_results[result.metric] = result
generate_json_report(gt, ocr, report_prefix, metrics_results)
if html:
generate_html_report(gt, ocr, gt_text, ocr_text, report_prefix, metrics_results)
@ -179,6 +182,7 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio
@click.argument("gt", type=click.Path(exists=True))
@click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report")
@click.option("--html", default=True, is_flag=True, help="Enable/disable html report.")
@click.option(
"--metrics",
default="cer,wer",
@ -191,7 +195,7 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio
metavar="LEVEL",
)
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
def main(gt, ocr, report_prefix, html, metrics, textequiv_level, progress):
"""
Compare the PAGE/ALTO/text document GT against the document OCR.
@ -210,12 +214,21 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
The html report can be enabled/disabled using --html / --no-html.
initLogging()
Config.progress = progress
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
process(
gt,
ocr,
report_prefix,
html=html,
metrics=metrics,
textequiv_level=textequiv_level,
if __name__ == "__main__":
@ -18,6 +18,11 @@
"recognition/text-recognition"
],
"parameters": {
"html": {
"type": "boolean",
"default": true,
"description": "Enable/disable html report."
},
"metrics": {
"type": "string",
"enum": ["", "boc", "boc,bow", "bow", "ca", "ca,boc", "ca,boc,bow", "ca,bow", "ca,wa", "ca,wa,boc", "ca,wa,boc,bow", "ca,wa,bow", "ca,wer", "ca,wer,boc", "ca,wer,boc,bow", "ca,wer,bow", "cer", "cer,boc", "cer,boc,bow", "cer,bow", "cer,wa", "cer,wa,boc", "cer,wa,boc,bow", "cer,wa,bow", "cer,wer", "cer,wer,boc", "cer,wer,boc,bow", "cer,wer,bow", "wa", "wa,boc", "wa,boc,bow", "wa,bow", "wer", "wer,boc", "wer,boc,bow", "wer,bow"],
@ -28,7 +33,7 @@
"enum": ["region", "line"],
"default": "region",
"description": "PAGE XML hierarchy level to extract the text from"
"description": "PAGE XML hierarchy level to extract the text from."
}
@ -30,6 +30,7 @@ class OcrdDinglehopperEvaluate(Processor):
log = getLogger("processor.OcrdDinglehopperEvaluate")
html = self.parameter["html"]
metrics = self.parameter["metrics"]
textequiv_level = self.parameter["textequiv_level"]
gt_grp, ocr_grp = self.input_file_grp.split(",")
@ -57,15 +58,16 @@ class OcrdDinglehopperEvaluate(Processor):
gt_file.local_filename,
ocr_file.local_filename,
# Add reports to the workspace
for report_suffix, mimetype in [
[".html", "text/html"],
[".json", "application/json"],
]:
# The JSON report is always registered; the HTML report only when the `html`
# parameter is enabled, so the workspace never references a report file that
# was not requested.
report_types = [(".json", "application/json")]
if html:
    report_types.append((".html", "text/html"))
for report_suffix, mimetype in report_types:
self.workspace.add_file(
ID=file_id + report_suffix,
file_grp=self.output_file_grp,
@ -1,3 +1,5 @@
from pathlib import Path
import pytest
from .util import working_directory
@ -16,3 +18,19 @@ def test_cli_unknown_metric(tmp_path):
with pytest.raises(ValueError, match="Unknown metric 'unknown'."):
process("gt.txt", "ocr.txt", "report", metrics="cer,unknown")
@pytest.mark.integration
def test_cli_html_report_parameter(tmp_path):
    """Check that passing html=False suppresses the html report only."""
    with working_directory(str(tmp_path)):
        # Empty GT and OCR inputs are enough; only the report files matter here.
        for input_name in ("gt.txt", "ocr.txt"):
            Path(input_name).write_text("")

        process("gt.txt", "ocr.txt", "report", html=False)

        report = Path("report")
        # The JSON report is still written ...
        assert report.with_suffix(".json").exists()
        # ... while the html report is not.
        assert not report.with_suffix(".html").exists()
@ -1,5 +1,6 @@
import json
from itertools import combinations
@ -45,11 +46,10 @@ def test_cli_json(metrics, tmp_path):
ocrf.write("AAAAB")
with open("gt.txt", "r") as gtf:
print(gtf.read())
process("gt.txt", "ocr.txt", "report", metrics=",".join(metrics))
with open("report.json", "r") as jsonf:
print(jsonf.read())
for suffix in (".json", ".html"):
assert Path("report").with_suffix(suffix).exists()
j = json.load(jsonf)
@ -76,6 +76,10 @@ def test_cli_json_extremes(gt, ocr, err, tmp_path):
ocrf.write(ocr)
process("gt.txt", "ocr.txt", "report", metrics="ca,wa,boc,bow")
for metric in set(METRIC_DICT.values()):