mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-01 14:40:00 +02:00
Merge pull request #83 from INL/feat/batch-processing
Add batch processing and report summaries
This commit is contained in:
commit
35be58cb94
17 changed files with 17584 additions and 26 deletions
58
README.md
58
README.md
|
@ -5,7 +5,8 @@ dinglehopper is an OCR evaluation tool and reads
|
|||
[ALTO](https://github.com/altoxml),
|
||||
[PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. It
|
||||
compares a ground truth (GT) document page with an OCR result page to compute
|
||||
metrics and a word/character differences report.
|
||||
metrics and a word/character differences report. It also supports batch processing by
|
||||
generating, aggregating and summarizing multiple reports.
|
||||
|
||||
[](https://circleci.com/gh/qurator-spk/dinglehopper)
|
||||
|
||||
|
@ -27,7 +28,7 @@ sudo pip install .
|
|||
Usage
|
||||
-----
|
||||
~~~
|
||||
Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
|
||||
Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] [REPORTS_FOLDER]
|
||||
|
||||
Compare the PAGE/ALTO/text document GT against the document OCR.
|
||||
|
||||
|
@ -35,19 +36,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
|
|||
their text and falls back to plain text if no ALTO or PAGE is detected.
|
||||
|
||||
The files GT and OCR are usually a ground truth document and the result of
|
||||
an OCR software, but you may use dinglehopper to compare two OCR results.
|
||||
In that case, use --no-metrics to disable the then meaningless metrics and
|
||||
also change the color scheme from green/red to blue.
|
||||
an OCR software, but you may use dinglehopper to compare two OCR results. In
|
||||
that case, use --no-metrics to disable the then meaningless metrics and also
|
||||
change the color scheme from green/red to blue.
|
||||
|
||||
The comparison report will be written to $REPORT_PREFIX.{html,json}, where
|
||||
$REPORT_PREFIX defaults to "report". The reports include the character
|
||||
error rate (CER) and the word error rate (WER).
|
||||
The comparison report will be written to
|
||||
$REPORTS_FOLDER/$REPORT_PREFIX.{html,json}, where $REPORTS_FOLDER defaults
|
||||
to the current working directory and $REPORT_PREFIX defaults to "report".
|
||||
The reports include the character error rate (CER) and the word error rate
|
||||
(WER).
|
||||
|
||||
By default, the text of PAGE files is extracted on 'region' level. You may
|
||||
use "--textequiv-level line" to extract from the level of TextLine tags.
|
||||
|
||||
Options:
|
||||
--metrics / --no-metrics Enable/disable metrics and green/red
|
||||
--differences BOOLEAN Enable reporting character and word level
|
||||
differences
|
||||
--textequiv-level LEVEL PAGE TextEquiv level to extract text from
|
||||
--progress Show progress bar
|
||||
--help Show this message and exit.
|
||||
|
@ -61,6 +66,43 @@ This generates `report.html` and `report.json`.
|
|||
|
||||

|
||||
|
||||
Batch comparison between folders of GT and OCR files can be done by simply providing
|
||||
folders:
|
||||
~~~
|
||||
dinglehopper gt/ ocr/ report output_folder/
|
||||
~~~
|
||||
This assumes that you have files with the same name in both folders, e.g.
|
||||
`gt/00000001.xml` and `ocr/00000001.xml`.
|
||||
|
||||
The example generates reports for each set of files, with the prefix `report`, in the
|
||||
(automatically created) folder `output_folder/`.
|
||||
|
||||
By default, the JSON report does not contain the character and word differences, only
|
||||
the calculated metrics. If you want to include the differences, use the
|
||||
`--differences` flag:
|
||||
|
||||
~~~
|
||||
dinglehopper gt/ ocr/ report output_folder/ --differences True
|
||||
~~~
|
||||
|
||||
### dinglehopper-summarize
|
||||
A set of (JSON) reports can be summarized into a single set of
|
||||
reports. This is useful after having generated reports in batch.
|
||||
Example:
|
||||
~~~
|
||||
dinglehopper-summarize output_folder/
|
||||
~~~
|
||||
This generates `summary.html` and `summary.json` in the same `output_folder`.
|
||||
|
||||
If you are summarizing many reports and have used the `--differences` flag while
|
||||
generating them, it may be useful to limit the number of differences reported by using
|
||||
the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
|
||||
report, making it easier to open and navigate. Note that the JSON report will still
|
||||
contain all differences. Example:
|
||||
~~~
|
||||
dinglehopper-summarize output_folder/ --occurrences-threshold 10
|
||||
~~~
|
||||
|
||||
### dinglehopper-line-dirs
|
||||
You also may want to compare a directory of GT text files (e.g. `gt/line0001.gt.txt`)
|
||||
with a directory of OCR text files (e.g. `ocr/line0001.some-ocr.txt`) with a separate
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import os
|
||||
from collections import Counter
|
||||
|
||||
import click
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
|
@ -6,15 +7,15 @@ from markupsafe import escape
|
|||
from uniseg.graphemecluster import grapheme_clusters
|
||||
from ocrd_utils import initLogging
|
||||
|
||||
from .character_error_rate import character_error_rate_n
|
||||
from .word_error_rate import word_error_rate_n, words_normalized
|
||||
from .align import seq_align
|
||||
from .extracted_text import ExtractedText
|
||||
from .ocr_files import extract
|
||||
from .config import Config
|
||||
from dinglehopper.character_error_rate import character_error_rate_n
|
||||
from dinglehopper.word_error_rate import word_error_rate_n, words_normalized
|
||||
from dinglehopper.align import seq_align
|
||||
from dinglehopper.extracted_text import ExtractedText
|
||||
from dinglehopper.ocr_files import extract
|
||||
from dinglehopper.config import Config
|
||||
|
||||
|
||||
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
|
||||
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
|
||||
gtx = ""
|
||||
ocrx = ""
|
||||
|
||||
|
@ -54,6 +55,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
|
|||
|
||||
g_pos = 0
|
||||
o_pos = 0
|
||||
found_differences = []
|
||||
|
||||
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
|
||||
css_classes = None
|
||||
gt_id = None
|
||||
|
@ -66,6 +69,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
|
|||
# Deletions and inserts only produce one id + None, UI must
|
||||
# support this, i.e. display for the one id produced
|
||||
|
||||
if differences:
|
||||
found_differences.append(f'{g} :: {o}')
|
||||
|
||||
gtx += joiner + format_thing(g, css_classes, gt_id)
|
||||
ocrx += joiner + format_thing(o, css_classes, ocr_id)
|
||||
|
||||
|
@ -74,6 +80,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
|
|||
if o is not None:
|
||||
o_pos += len(o)
|
||||
|
||||
found_differences = dict(Counter(elem for elem in found_differences))
|
||||
|
||||
return """
|
||||
<div class="row">
|
||||
<div class="col-md-6 gt">{}</div>
|
||||
|
@ -81,7 +89,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
|
|||
</div>
|
||||
""".format(
|
||||
gtx, ocrx
|
||||
)
|
||||
), found_differences
|
||||
|
||||
|
||||
def json_float(value):
|
||||
|
@ -97,7 +105,8 @@ def json_float(value):
|
|||
return str(value)
|
||||
|
||||
|
||||
def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
||||
def process(gt, ocr, report_prefix, reports_folder='.', *, metrics=True,
|
||||
differences=False, textequiv_level="region"):
|
||||
"""Check OCR result against GT.
|
||||
|
||||
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
|
||||
|
@ -110,14 +119,15 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
|||
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
|
||||
wer, n_words = word_error_rate_n(gt_text, ocr_text)
|
||||
|
||||
char_diff_report = gen_diff_report(
|
||||
gt_text, ocr_text, css_prefix="c", joiner="", none="·"
|
||||
)
|
||||
char_diff_report, diff_c = gen_diff_report(gt_text, ocr_text, css_prefix="c",
|
||||
joiner="",
|
||||
none="·", differences=differences)
|
||||
|
||||
gt_words = words_normalized(gt_text)
|
||||
ocr_words = words_normalized(ocr_text)
|
||||
word_diff_report = gen_diff_report(
|
||||
gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
|
||||
word_diff_report, diff_w = gen_diff_report(
|
||||
gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯",
|
||||
differences=differences
|
||||
)
|
||||
|
||||
env = Environment(
|
||||
|
@ -129,7 +139,11 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
|||
|
||||
for report_suffix in (".html", ".json"):
|
||||
template_fn = "report" + report_suffix + ".j2"
|
||||
out_fn = report_prefix + report_suffix
|
||||
|
||||
if not os.path.isdir(reports_folder):
|
||||
os.mkdir(reports_folder)
|
||||
|
||||
out_fn = os.path.join(reports_folder, report_prefix + report_suffix)
|
||||
|
||||
template = env.get_template(template_fn)
|
||||
template.stream(
|
||||
|
@ -142,16 +156,42 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
|||
char_diff_report=char_diff_report,
|
||||
word_diff_report=word_diff_report,
|
||||
metrics=metrics,
|
||||
differences=differences,
|
||||
diff_c=diff_c,
|
||||
diff_w=diff_w,
|
||||
).dump(out_fn)
|
||||
|
||||
|
||||
def process_dir(gt, ocr, report_prefix, reports_folder, metrics, differences,
                textequiv_level):
    """Compare every file of the GT folder with its namesake in the OCR folder.

    Each entry of ``gt`` is looked up under the identical file name in ``ocr``.
    When both exist as regular files the pair is handed to ``process()`` with
    the report prefix ``<file name>-<report_prefix>``; otherwise the pair is
    announced on stdout and skipped.
    """
    for name in os.listdir(gt):
        gt_path = os.path.join(gt, name)
        ocr_path = os.path.join(ocr, name)

        # Guard clause: sub-directories and files without a counterpart in the
        # OCR folder are not compared.
        pair_is_complete = os.path.isfile(gt_path) and os.path.isfile(ocr_path)
        if not pair_is_complete:
            print("Skipping {0} and {1}".format(gt_path, ocr_path))
            continue

        process(
            gt_path,
            ocr_path,
            f"{name}-{report_prefix}",
            reports_folder=reports_folder,
            metrics=metrics,
            differences=differences,
            textequiv_level=textequiv_level,
        )
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument("gt", type=click.Path(exists=True))
|
||||
@click.argument("ocr", type=click.Path(exists=True))
|
||||
@click.argument("report_prefix", type=click.Path(), default="report")
|
||||
@click.argument("reports_folder", type=click.Path(), default=".")
|
||||
@click.option(
|
||||
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
|
||||
)
|
||||
@click.option(
|
||||
"--differences",
|
||||
default=False,
|
||||
help="Enable reporting character and word level differences"
|
||||
)
|
||||
@click.option(
|
||||
"--textequiv-level",
|
||||
default="region",
|
||||
|
@ -159,7 +199,8 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
|||
metavar="LEVEL",
|
||||
)
|
||||
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
|
||||
def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
|
||||
def main(gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level,
|
||||
progress):
|
||||
"""
|
||||
Compare the PAGE/ALTO/text document GT against the document OCR.
|
||||
|
||||
|
@ -171,7 +212,8 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
|
|||
that case, use --no-metrics to disable the then meaningless metrics and also
|
||||
change the color scheme from green/red to blue.
|
||||
|
||||
The comparison report will be written to $REPORT_PREFIX.{html,json}, where
|
||||
The comparison report will be written to $REPORTS_FOLDER/$REPORT_PREFIX.{html,json},
|
||||
where $REPORTS_FOLDER defaults to the current working directory and
|
||||
$REPORT_PREFIX defaults to "report". The reports include the character error
|
||||
rate (CER) and the word error rate (WER).
|
||||
|
||||
|
@ -180,7 +222,17 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
|
|||
"""
|
||||
initLogging()
|
||||
Config.progress = progress
|
||||
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
|
||||
if os.path.isdir(gt):
|
||||
if not os.path.isdir(ocr):
|
||||
raise click.BadParameter(
|
||||
"OCR must be a directory if GT is a directory", param_hint="ocr"
|
||||
)
|
||||
else:
|
||||
process_dir(gt, ocr, report_prefix, reports_folder, metrics,
|
||||
differences, textequiv_level)
|
||||
else:
|
||||
process(gt, ocr, report_prefix, reports_folder, metrics=metrics,
|
||||
differences=differences, textequiv_level=textequiv_level)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
101
dinglehopper/cli_summarize.py
Normal file
101
dinglehopper/cli_summarize.py
Normal file
|
@ -0,0 +1,101 @@
|
|||
import json
|
||||
import os
|
||||
|
||||
import click
|
||||
from ocrd_utils import initLogging
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
|
||||
from dinglehopper.cli import json_float
|
||||
|
||||
|
||||
def process(reports_folder, occurrences_threshold=1):
    """Summarize all JSON reports found in ``reports_folder``.

    Every ``*.json`` file in the folder is loaded. Files that lack a ``cer``
    or ``wer`` entry are skipped with a message. For the remaining reports the
    CER/WER values are averaged and the per-report difference counts are added
    up. The result is rendered with the ``summary.html.j2`` and
    ``summary.json.j2`` templates into ``summary.html`` / ``summary.json``
    inside ``reports_folder``.

    :param reports_folder: folder containing the JSON reports; also receives
        the generated summary files.
    :param occurrences_threshold: passed through to the templates, which use it
        to limit the differences that are listed.
    :return: ``None``; returns early (after a message) when no usable report
        was found.
    """
    cer_list = []
    wer_list = []
    cer_sum = 0
    wer_sum = 0
    diff_c = {}
    diff_w = {}

    for report in os.listdir(reports_folder):
        if not report.endswith(".json"):
            continue

        # Jinja writes the reports as UTF-8, so read them back as UTF-8
        # instead of relying on the platform's default encoding.
        with open(os.path.join(reports_folder, report), "r",
                  encoding="utf-8") as f:
            report_data = json.load(f)

        if "cer" not in report_data or "wer" not in report_data:
            click.echo(
                f"Skipping {report} because it does not contain CER and WER")
            continue

        cer = report_data["cer"]
        wer = report_data["wer"]
        cer_list.append(cer)
        wer_list.append(wer)
        cer_sum += cer
        wer_sum += wer

        # Reports only carry a "differences" section when they were generated
        # with differences enabled; treat a missing section as "no differences"
        # instead of failing with a KeyError.
        differences = report_data.get("differences", {})
        for key, value in differences.get("character_level", {}).items():
            diff_c[key] = diff_c.get(key, 0) + value
        for key, value in differences.get("word_level", {}).items():
            diff_w[key] = diff_w.get(key, 0) + value

    if len(cer_list) == 0:
        click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
        return

    cer_avg = cer_sum / len(cer_list)
    wer_avg = wer_sum / len(wer_list)

    print(f"Number of reports: {len(cer_list)}")
    print(f"Average CER: {cer_avg}")
    print(f"Average WER: {wer_avg}")
    # These two lines print the summed error rates; label them as such.
    print(f"Sum of CER: {cer_sum}")
    print(f"Sum of WER: {wer_sum}")

    env = Environment(
        loader=FileSystemLoader(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
        )
    )
    env.filters["json_float"] = json_float
    for report_suffix in (".html", ".json"):
        template_fn = "summary" + report_suffix + ".j2"

        out_fn = os.path.join(reports_folder, 'summary' + report_suffix)
        template = env.get_template(template_fn)
        template.stream(
            num_reports=len(cer_list),
            cer_avg=cer_avg,
            wer_avg=wer_avg,
            diff_c=diff_c,
            diff_w=diff_w,
            occurrences_threshold=occurrences_threshold,
        ).dump(out_fn)
|
||||
|
||||
|
||||
@click.command()
@click.argument(
    "reports_folder",
    type=click.Path(exists=True),
    default="./reports",
)
@click.option(
    "--occurrences-threshold",
    type=int,
    default=1,
    help="Only show differences that occur at least this many times.",
)
def main(reports_folder, occurrences_threshold):
    """
    Summarize the results from multiple reports generated earlier by dinglehopper.
    It calculates the average CER and WER, as well as a sum of common mistakes.
    Reports include lists of mistakes and their occurrences.

    You may use a threshold to reduce the file size of the HTML report by only showing
    mistakes whose number of occurrences is at least the threshold. The JSON report
    will always contain all mistakes.

    All JSON files in the provided folder will be gathered and summarized.
    """
    # NOTE: the docstring above doubles as the --help text rendered by click,
    # so it is worded for end users and kept consistent with the option help.
    initLogging()
    process(reports_folder, occurrences_threshold)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -26,6 +26,22 @@
|
|||
border: 2px solid;
|
||||
border-radius: 5px;
|
||||
}
|
||||
|
||||
.row {
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
table {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
th {
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
th:hover {
|
||||
background-color: #eee;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
@ -50,6 +66,32 @@
|
|||
<h2>Word differences</h2>
|
||||
{{ word_diff_report }}
|
||||
|
||||
{%- if differences %}
|
||||
{% set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
|
||||
|
||||
<div class="row">
|
||||
{% for section in sections %}
|
||||
<div class="col-md-6">
|
||||
<h2>{{ section['title'] }}</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>GT</th>
|
||||
<th>OCR</th>
|
||||
<th>Occurrences</th>
|
||||
</tr>
|
||||
{% for gt_ocr, occurrences in section['data'].items() %}
|
||||
<tr>
|
||||
<td>{{ gt_ocr.split("::")[0] }}</td>
|
||||
<td>{{ gt_ocr.split("::")[1] }}</td>
|
||||
<td>{{ occurrences }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{%- endif %}
|
||||
|
||||
</div>
|
||||
|
||||
|
|
|
@ -12,4 +12,28 @@ $(document).ready(function() {
|
|||
$('.diff').mouseout(function() {
|
||||
find_diff_class($(this).attr('class')).removeClass('diff-highlight');
|
||||
});
|
||||
|
||||
/* Sort this column of the table */
|
||||
$('th').click(function () {
|
||||
var table = $(this).closest('table');
|
||||
var rows = table.find('tbody > tr').toArray().sort(compareRows($(this).index()));
|
||||
this.asc = !this.asc;
|
||||
if (!this.asc) {
|
||||
rows = rows.reverse();
|
||||
}
|
||||
for (var i = 0; i < rows.length; i++) {
|
||||
table.children('tbody').append(rows[i]);
|
||||
}
|
||||
});
|
||||
|
||||
function compareRows(index) {
|
||||
return function (row1, row2) {
|
||||
var cell1 = $(row1).children('td').eq(index).text().toLowerCase();
|
||||
var cell2 = $(row2).children('td').eq(index).text().toLowerCase();
|
||||
return cell1.localeCompare(cell2, undefined, {
|
||||
numeric: true,
|
||||
sensitivity: 'base'
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
|
|
@ -4,6 +4,12 @@
|
|||
{% if metrics %}
|
||||
"cer": {{ cer|json_float }},
|
||||
"wer": {{ wer|json_float }},
|
||||
{% endif %}
|
||||
{% if differences %}
|
||||
"differences": {
|
||||
"character_level": {{ diff_c|tojson }},
|
||||
"word_level": {{ diff_w|tojson }}
|
||||
},
|
||||
{% endif %}
|
||||
"n_characters": {{ n_characters }},
|
||||
"n_words": {{ n_words }}
|
||||
|
|
136
dinglehopper/templates/summary.html.j2
Normal file
136
dinglehopper/templates/summary.html.j2
Normal file
|
@ -0,0 +1,136 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
|
||||
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
|
||||
<style type="text/css">
|
||||
{% if metrics %}
|
||||
.gt .diff {
|
||||
color: green;
|
||||
}
|
||||
.ocr .diff {
|
||||
color: red;
|
||||
}
|
||||
{% else %}
|
||||
.gt .diff, .ocr .diff {
|
||||
color: blue;
|
||||
}
|
||||
{% endif %}
|
||||
.ellipsis {
|
||||
opacity: 0.5;
|
||||
font-style: italic;
|
||||
}
|
||||
.diff-highlight {
|
||||
border: 2px solid;
|
||||
border-radius: 5px;
|
||||
}
|
||||
|
||||
.row {
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
table {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.cer {
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
tr:hover {
|
||||
background-color: #f5f5f5;
|
||||
}
|
||||
|
||||
th {
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
th:hover {
|
||||
background-color: #eee;
|
||||
}
|
||||
|
||||
td {
|
||||
min-width: 100px;
|
||||
}
|
||||
|
||||
td:hover {
|
||||
background-color: #eee;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<div class="container">
|
||||
|
||||
<div class="row">
|
||||
<h1>Summary of all reports</h1>
|
||||
</div>
|
||||
|
||||
<div class="row">
|
||||
<p>Number of reports: {{ num_reports }}</p>
|
||||
</div>
|
||||
|
||||
{% if cer_avg and wer_avg -%}
|
||||
<div class="row">
|
||||
<h2>Metrics</h2>
|
||||
</div>
|
||||
|
||||
<div class="row cer">
|
||||
<p>Average CER: {{ cer_avg|round(4) }}</p>
|
||||
<p>Average WER: {{ wer_avg|round(4) }}</p>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{%- if diff_c and diff_w %}
|
||||
{%- set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
|
||||
|
||||
<div class="row">
|
||||
{%- for section in sections %}
|
||||
<div class="col-md-6">
|
||||
<h2>{{ section['title'] }}</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>GT</th><th>OCR</th><th>Occurrences</th></tr>
|
||||
</thead>
|
||||
{%- set num_omitted = namespace(value=0) -%}
|
||||
{% for gt_ocr, occurrences in section['data'].items() -%}
|
||||
{% if occurrences < occurrences_threshold -%}
|
||||
{%- set num_omitted.value = num_omitted.value + 1 %}
|
||||
{%- else -%}
|
||||
{%- set gt = gt_ocr.split(" :: ")[0] %}
|
||||
{%- set ocr = gt_ocr.split(" :: ")[1] %}
|
||||
<tr>
|
||||
<td title="{{ gt|urlencode }}">{{ gt }}</td>{# display the unicode character #}
|
||||
<td title="{{ ocr|urlencode }}">{{ ocr }}</td >
|
||||
<td>{{ occurrences }}</td>
|
||||
</tr>
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
|
||||
{% if num_omitted.value > 0 and occurrences_threshold > 1 -%}
|
||||
<p>Skipped {{ num_omitted.value }} diffs with fewer than {{ occurrences_threshold }} occurrences. The complete list of diffs is available in the accompanying JSON file.</p>
|
||||
{%- set num_omitted.value = 0 %}
|
||||
{%- endif %}
|
||||
</table>
|
||||
</div>
|
||||
{%- endfor %}
|
||||
</div>
|
||||
{%- endif %}
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
|
||||
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
|
||||
|
||||
<script>
|
||||
{% include 'report.html.js' %}
|
||||
</script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
15
dinglehopper/templates/summary.json.j2
Normal file
15
dinglehopper/templates/summary.json.j2
Normal file
|
@ -0,0 +1,15 @@
|
|||
{#- Summary of several dinglehopper reports, rendered as a single JSON object.
    The averages are tested against none (not for truthiness) so that an
    average of exactly 0.0 is still written out. The differences section is
    emitted as soon as either level has entries. -#}
{
    "num_reports": {{ num_reports }}
    {%- if cer_avg is defined and cer_avg is not none and wer_avg is defined and wer_avg is not none %}
    ,
    "cer_avg": {{ cer_avg|json_float }},
    "wer_avg": {{ wer_avg|json_float }}
    {%- endif %}
    {%- if diff_c or diff_w %}
    ,
    "differences": {
        "character_level": {{ diff_c|tojson }},
        "word_level": {{ diff_w|tojson }}
    }
    {%- endif %}
}
|
3394
dinglehopper/tests/data/directory-test/gt/1.xml
Normal file
3394
dinglehopper/tests/data/directory-test/gt/1.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
dinglehopper/tests/data/directory-test/gt/2.xml
Normal file
3394
dinglehopper/tests/data/directory-test/gt/2.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
dinglehopper/tests/data/directory-test/ocr/1.xml
Normal file
3394
dinglehopper/tests/data/directory-test/ocr/1.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
dinglehopper/tests/data/directory-test/ocr/2.xml
Normal file
3394
dinglehopper/tests/data/directory-test/ocr/2.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
Normal file
3394
dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
Normal file
File diff suppressed because it is too large
Load diff
41
dinglehopper/tests/test_integ_cli_dir.py
Normal file
41
dinglehopper/tests/test_integ_cli_dir.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
import os
|
||||
import pytest
|
||||
from ocrd_utils import initLogging
|
||||
from dinglehopper.cli import process_dir
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_cli_directory(tmp_path):
|
||||
"""
|
||||
Test that the cli/process_dir() processes a directory of files and
|
||||
yields JSON and HTML reports.
|
||||
"""
|
||||
|
||||
initLogging()
|
||||
process_dir(os.path.join(data_dir, "directory-test", "gt"),
|
||||
os.path.join(data_dir, "directory-test", "ocr"),
|
||||
"report", str(tmp_path / "reports"), False, True,
|
||||
"line")
|
||||
|
||||
assert os.path.exists(tmp_path / "reports/1.xml-report.json")
|
||||
assert os.path.exists(tmp_path / "reports/1.xml-report.html")
|
||||
assert os.path.exists(tmp_path / "reports/2.xml-report.json")
|
||||
assert os.path.exists(tmp_path / "reports/2.xml-report.html")
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_cli_fail_without_gt(tmp_path):
|
||||
"""
|
||||
Test that the cli/process_dir skips a file if there is no corresponding file
|
||||
in the other directory.
|
||||
"""
|
||||
|
||||
initLogging()
|
||||
process_dir(os.path.join(data_dir, "directory-test", "gt"),
|
||||
os.path.join(data_dir, "directory-test", "ocr"),
|
||||
"report", str(tmp_path / "reports"), False, True,
|
||||
"line")
|
||||
|
||||
assert len(os.listdir(tmp_path / "reports")) == 2 * 2
|
27
dinglehopper/tests/test_integ_differences.py
Normal file
27
dinglehopper/tests/test_integ_differences.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
import json
|
||||
import os
|
||||
import pytest
|
||||
from ocrd_utils import initLogging
|
||||
from dinglehopper.cli import process
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_cli_differences(tmp_path):
|
||||
"""Test that the cli/process() yields a JSON report that includes
|
||||
the differences found between the GT and OCR text"""
|
||||
|
||||
initLogging()
|
||||
process(os.path.join(data_dir, "test-gt.page2018.xml"),
|
||||
os.path.join(data_dir, "test-fake-ocr.page2018.xml"),
|
||||
"report", tmp_path, differences=True)
|
||||
|
||||
assert os.path.exists(tmp_path / "report.json")
|
||||
|
||||
with open(tmp_path / "report.json", "r") as jsonf:
|
||||
j = json.load(jsonf)
|
||||
|
||||
assert j["differences"] == {"character_level": {'n :: m': 1, 'ſ :: f': 1},
|
||||
"word_level": {'Augenblick :: Augemblick': 1,
|
||||
'Verſprochene :: Verfprochene': 1}}
|
101
dinglehopper/tests/test_integ_summarize.py
Normal file
101
dinglehopper/tests/test_integ_summarize.py
Normal file
|
@ -0,0 +1,101 @@
|
|||
import json
|
||||
import os
|
||||
import pytest
|
||||
from .util import working_directory
|
||||
from .. import cli_summarize
|
||||
|
||||
expected_cer_avg = (0.05 + 0.10) / 2
|
||||
expected_wer_avg = (0.15 + 0.20) / 2
|
||||
expected_diff_c = {"a": 30, "b": 50}
|
||||
expected_diff_w = {"c": 70, "d": 90}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def create_summaries(tmp_path):
|
||||
"""Create two summary reports with mock data"""
|
||||
reports_dirname = tmp_path / "reports"
|
||||
reports_dirname.mkdir()
|
||||
|
||||
report1 = {"cer": 0.05, "wer": 0.15,
|
||||
"differences": {
|
||||
"character_level": {"a": 10, "b": 20},
|
||||
"word_level": {"c": 30, "d": 40}
|
||||
}}
|
||||
report2 = {"cer": 0.10, "wer": 0.20,
|
||||
"differences": {
|
||||
"character_level": {"a": 20, "b": 30},
|
||||
"word_level": {"c": 40, "d": 50}
|
||||
}}
|
||||
|
||||
with open(os.path.join(reports_dirname, "report1.json"), "w") as f:
|
||||
json.dump(report1, f)
|
||||
with open(os.path.join(reports_dirname, "report2.json"), "w") as f:
|
||||
json.dump(report2, f)
|
||||
|
||||
return str(reports_dirname)
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_cli_summarize_json(tmp_path, create_summaries):
|
||||
"""Test that the cli/process() yields a summarized JSON report"""
|
||||
with working_directory(tmp_path):
|
||||
reports_dirname = create_summaries
|
||||
cli_summarize.process(reports_dirname)
|
||||
|
||||
with open(os.path.join(reports_dirname, "summary.json"), "r") as f:
|
||||
summary_data = json.load(f)
|
||||
|
||||
|
||||
assert summary_data["num_reports"] == 2
|
||||
assert summary_data["cer_avg"] == expected_cer_avg
|
||||
assert summary_data["wer_avg"] == expected_wer_avg
|
||||
assert summary_data["differences"]["character_level"] == expected_diff_c
|
||||
assert summary_data["differences"]["word_level"] == expected_diff_w
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_cli_summarize_html(tmp_path, create_summaries):
|
||||
"""Test that the cli/process() yields an HTML report"""
|
||||
with working_directory(tmp_path):
|
||||
reports_dirname = create_summaries
|
||||
cli_summarize.process(reports_dirname)
|
||||
|
||||
html_file = os.path.join(reports_dirname, "summary.html")
|
||||
assert os.path.isfile(html_file)
|
||||
|
||||
with open(html_file, "r") as f:
|
||||
contents = f.read()
|
||||
|
||||
assert len(contents) > 0
|
||||
assert "Number of reports: 2" in contents
|
||||
assert f"Average CER: {round(expected_cer_avg, 4)}" in contents
|
||||
assert f"Average WER: {round(expected_wer_avg, 4)}" in contents
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries):
|
||||
"""
|
||||
Test that the cli/process() does not include reports that are missing a WER value.
|
||||
"""
|
||||
with working_directory(tmp_path):
|
||||
reports_dirname = create_summaries
|
||||
|
||||
# This third report has no WER value and should not be included in the summary
|
||||
report3 = {"cer": 0.10,
|
||||
"differences": {
|
||||
"character_level": {"a": 20, "b": 30},
|
||||
"word_level": {"c": 40, "d": 50}
|
||||
}}
|
||||
|
||||
with open(os.path.join(reports_dirname, "report3-missing-wer.json"), "w") as f:
|
||||
json.dump(report3, f)
|
||||
|
||||
cli_summarize.process(reports_dirname)
|
||||
|
||||
html_file = os.path.join(reports_dirname, "summary.html")
|
||||
assert os.path.isfile(html_file)
|
||||
|
||||
with open(html_file, "r") as f:
|
||||
contents = f.read()
|
||||
|
||||
assert "Number of reports: 2" in contents # report3 is not included
|
1
setup.py
1
setup.py
|
@ -27,6 +27,7 @@ setup(
|
|||
"dinglehopper=dinglehopper.cli:main",
|
||||
"dinglehopper-line-dirs=dinglehopper.cli_line_dirs:main",
|
||||
"dinglehopper-extract=dinglehopper.cli_extract:main",
|
||||
"dinglehopper-summarize=dinglehopper.cli_summarize:main",
|
||||
"ocrd-dinglehopper=dinglehopper.ocrd_cli:ocrd_dinglehopper",
|
||||
]
|
||||
},
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue