Merge pull request #83 from INL/feat/batch-processing

Add batch processing and report summaries
Mike Gerber 2 years ago committed by GitHub
commit 35be58cb94

@@ -5,7 +5,8 @@ dinglehopper is an OCR evaluation tool and reads
 [ALTO](https://github.com/altoxml),
 [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. It
 compares a ground truth (GT) document page with a OCR result page to compute
-metrics and a word/character differences report.
+metrics and a word/character differences report. It also supports batch processing by
+generating, aggregating and summarizing multiple reports.
 
 [![Build Status](https://circleci.com/gh/qurator-spk/dinglehopper.svg?style=svg)](https://circleci.com/gh/qurator-spk/dinglehopper)
@@ -27,7 +28,7 @@ sudo pip install .
 Usage
 -----
 ~~~
-Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
+Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] [REPORTS_FOLDER]
 
   Compare the PAGE/ALTO/text document GT against the document OCR.
@@ -35,19 +36,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
   their text and falls back to plain text if no ALTO or PAGE is detected.
 
   The files GT and OCR are usually a ground truth document and the result of
-  an OCR software, but you may use dinglehopper to compare two OCR results.
-  In that case, use --no-metrics to disable the then meaningless metrics and
-  also change the color scheme from green/red to blue.
+  an OCR software, but you may use dinglehopper to compare two OCR results. In
+  that case, use --no-metrics to disable the then meaningless metrics and also
+  change the color scheme from green/red to blue.
 
-  The comparison report will be written to $REPORT_PREFIX.{html,json}, where
-  $REPORT_PREFIX defaults to "report". The reports include the character
-  error rate (CER) and the word error rate (WER).
+  The comparison report will be written to
+  $REPORTS_FOLDER/$REPORT_PREFIX.{html,json}, where $REPORTS_FOLDER defaults
+  to the current working directory and $REPORT_PREFIX defaults to "report".
+  The reports include the character error rate (CER) and the word error rate
+  (WER).
 
   By default, the text of PAGE files is extracted on 'region' level. You may
   use "--textequiv-level line" to extract from the level of TextLine tags.
 
 Options:
   --metrics / --no-metrics  Enable/disable metrics and green/red
+  --differences             Enable reporting character and word level
+                            differences
   --textequiv-level LEVEL   PAGE TextEquiv level to extract text from
   --progress                Show progress bar
   --help                    Show this message and exit.
@@ -61,6 +66,43 @@ This generates `report.html` and `report.json`.
 
 ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true)
 
+Batch comparison between folders of GT and OCR files can be done by simply providing
+folders:
+~~~
+dinglehopper gt/ ocr/ report output_folder/
+~~~
+
+This assumes that you have files with the same name in both folders, e.g.
+`gt/00000001.page.xml` and `ocr/00000001.page.xml`.
+
+The example generates reports for each pair of files, named after the input file plus
+the prefix `report` (e.g. `00000001.page.xml-report.json`), in the (automatically
+created) folder `output_folder/`.
+
+By default, the JSON report does not contain the character and word differences, only
+the calculated metrics. If you want to include the differences, use the
+`--differences` flag:
+~~~
+dinglehopper gt/ ocr/ report output_folder/ --differences
+~~~
+
+### dinglehopper-summarize
+A set of (JSON) reports can be summarized into a single set of
+reports. This is useful after having generated reports in batch.
+Example:
+~~~
+dinglehopper-summarize output_folder/
+~~~
+This generates `summary.html` and `summary.json` in the same `output_folder`.
+
+If you are summarizing many reports and have used the `--differences` flag while
+generating them, it may be useful to limit the number of differences reported by using
+the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
+report, making it easier to open and navigate. Note that the JSON report will still
+contain all differences. Example:
+~~~
+dinglehopper-summarize output_folder/ --occurrences-threshold 10
+~~~
+
 ### dinglehopper-line-dirs
 You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.txt`)
 with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate

@@ -1,4 +1,5 @@
 import os
+from collections import Counter
 
 import click
 from jinja2 import Environment, FileSystemLoader
@@ -6,15 +7,15 @@ from markupsafe import escape
 from uniseg.graphemecluster import grapheme_clusters
 from ocrd_utils import initLogging
 
-from .character_error_rate import character_error_rate_n
-from .word_error_rate import word_error_rate_n, words_normalized
-from .align import seq_align
-from .extracted_text import ExtractedText
-from .ocr_files import extract
-from .config import Config
+from dinglehopper.character_error_rate import character_error_rate_n
+from dinglehopper.word_error_rate import word_error_rate_n, words_normalized
+from dinglehopper.align import seq_align
+from dinglehopper.extracted_text import ExtractedText
+from dinglehopper.ocr_files import extract
+from dinglehopper.config import Config
 
 
-def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
+def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
     gtx = ""
     ocrx = ""
@@ -54,6 +55,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
     g_pos = 0
     o_pos = 0
 
+    found_differences = []
+
     for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
         css_classes = None
         gt_id = None
@@ -66,6 +69,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
             # Deletions and inserts only produce one id + None, UI must
             # support this, i.e. display for the one id produced
 
+            if differences:
+                found_differences.append(f'{g} :: {o}')
+
         gtx += joiner + format_thing(g, css_classes, gt_id)
         ocrx += joiner + format_thing(o, css_classes, ocr_id)
@@ -74,6 +80,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
         if o is not None:
             o_pos += len(o)
 
+    found_differences = dict(Counter(elem for elem in found_differences))
+
     return """
 <div class="row">
    <div class="col-md-6 gt">{}</div>
@@ -81,7 +89,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
 </div>
 """.format(
         gtx, ocrx
-    )
+    ), found_differences
 
 
 def json_float(value):
@@ -97,7 +105,8 @@ def json_float(value):
     return str(value)
 
 
-def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
+def process(gt, ocr, report_prefix, reports_folder='.', *, metrics=True,
+            differences=False, textequiv_level="region"):
     """Check OCR result against GT.
 
     The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
@@ -110,14 +119,15 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
     cer, n_characters = character_error_rate_n(gt_text, ocr_text)
     wer, n_words = word_error_rate_n(gt_text, ocr_text)
 
-    char_diff_report = gen_diff_report(
-        gt_text, ocr_text, css_prefix="c", joiner="", none="·"
-    )
+    char_diff_report, diff_c = gen_diff_report(
+        gt_text, ocr_text, css_prefix="c", joiner="", none="·", differences=differences
+    )
 
     gt_words = words_normalized(gt_text)
     ocr_words = words_normalized(ocr_text)
-    word_diff_report = gen_diff_report(
-        gt_words, ocr_words, css_prefix="w", joiner=" ", none=""
-    )
+    word_diff_report, diff_w = gen_diff_report(
+        gt_words, ocr_words, css_prefix="w", joiner=" ", none="",
+        differences=differences
+    )
 
     env = Environment(
@@ -129,7 +139,11 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
 
     for report_suffix in (".html", ".json"):
         template_fn = "report" + report_suffix + ".j2"
-        out_fn = report_prefix + report_suffix
+
+        if not os.path.isdir(reports_folder):
+            os.mkdir(reports_folder)
+
+        out_fn = os.path.join(reports_folder, report_prefix + report_suffix)
 
         template = env.get_template(template_fn)
         template.stream(
@@ -142,16 +156,42 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
             char_diff_report=char_diff_report,
             word_diff_report=word_diff_report,
             metrics=metrics,
+            differences=differences,
+            diff_c=diff_c,
+            diff_w=diff_w,
         ).dump(out_fn)
 
 
+def process_dir(gt, ocr, report_prefix, reports_folder, metrics, differences,
+                textequiv_level):
+    for gt_file in os.listdir(gt):
+        gt_file_path = os.path.join(gt, gt_file)
+        ocr_file_path = os.path.join(ocr, gt_file)
+
+        if os.path.isfile(gt_file_path) and os.path.isfile(ocr_file_path):
+            process(gt_file_path, ocr_file_path,
+                    f"{gt_file}-{report_prefix}",
+                    reports_folder=reports_folder,
+                    metrics=metrics,
+                    differences=differences,
+                    textequiv_level=textequiv_level)
+        else:
+            print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
+
+
 @click.command()
 @click.argument("gt", type=click.Path(exists=True))
 @click.argument("ocr", type=click.Path(exists=True))
 @click.argument("report_prefix", type=click.Path(), default="report")
+@click.argument("reports_folder", type=click.Path(), default=".")
 @click.option(
     "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
 )
+@click.option(
+    "--differences",
+    is_flag=True,
+    default=False,
+    help="Enable reporting character and word level differences",
+)
 @click.option(
     "--textequiv-level",
     default="region",
@@ -159,7 +199,8 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
     metavar="LEVEL",
 )
 @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
-def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
+def main(gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level,
+         progress):
     """
     Compare the PAGE/ALTO/text document GT against the document OCR.
@@ -171,7 +212,8 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
     that case, use --no-metrics to disable the then meaningless metrics and also
     change the color scheme from green/red to blue.
 
-    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
+    The comparison report will be written to $REPORTS_FOLDER/$REPORT_PREFIX.{html,json},
+    where $REPORTS_FOLDER defaults to the current working directory and
     $REPORT_PREFIX defaults to "report". The reports include the character error
     rate (CER) and the word error rate (WER).
@@ -180,7 +222,17 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
     """
     initLogging()
     Config.progress = progress
-    process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
+    if os.path.isdir(gt):
+        if not os.path.isdir(ocr):
+            raise click.BadParameter(
+                "OCR must be a directory if GT is a directory", param_hint="ocr"
+            )
+        else:
+            process_dir(gt, ocr, report_prefix, reports_folder, metrics,
+                        differences, textequiv_level)
+    else:
+        process(gt, ocr, report_prefix, reports_folder, metrics=metrics,
+                differences=differences, textequiv_level=textequiv_level)
 
 
 if __name__ == "__main__":
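The docstring of `process()` notes that the undecorated function is kept so it can also be called from Python rather than only through the click CLI. As a rough illustration of the new batch API (not part of this PR; the file names and the `out/` folder are made-up for the sketch), a run could be driven directly like this:

~~~
from dinglehopper.cli import process, process_dir

# Compare one GT/OCR pair and write out/report.{html,json}, including the
# character/word differences in the JSON report (paths are assumptions).
process("gt/00000001.page.xml", "ocr/00000001.page.xml", "report",
        reports_folder="out", differences=True, textequiv_level="region")

# Compare whole folders: one report pair per file name present in both
# directories (arguments: gt, ocr, report_prefix, reports_folder, metrics,
# differences, textequiv_level).
process_dir("gt", "ocr", "report", "out", True, True, "region")
~~~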

@@ -0,0 +1,101 @@
import json
import os

import click
from ocrd_utils import initLogging
from jinja2 import Environment, FileSystemLoader

from dinglehopper.cli import json_float


def process(reports_folder, occurrences_threshold=1):
    cer_list = []
    wer_list = []
    cer_sum = 0
    wer_sum = 0
    diff_c = {}
    diff_w = {}

    for report in os.listdir(reports_folder):
        if report.endswith(".json"):
            with open(os.path.join(reports_folder, report), "r") as f:
                report_data = json.load(f)

            if "cer" not in report_data or "wer" not in report_data:
                click.echo(
                    f"Skipping {report} because it does not contain CER and WER")
                continue

            cer = report_data["cer"]
            wer = report_data["wer"]
            cer_list.append(cer)
            wer_list.append(wer)
            cer_sum += cer
            wer_sum += wer

            for key, value in report_data.get("differences", {}).get(
                    "character_level", {}).items():
                diff_c[key] = diff_c.get(key, 0) + value
            for key, value in report_data.get("differences", {}).get(
                    "word_level", {}).items():
                diff_w[key] = diff_w.get(key, 0) + value

    if len(cer_list) == 0:
        click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
        return

    cer_avg = cer_sum / len(cer_list)
    wer_avg = wer_sum / len(wer_list)

    print(f"Number of reports: {len(cer_list)}")
    print(f"Average CER: {cer_avg}")
    print(f"Average WER: {wer_avg}")
    print(f"Sum of CER: {cer_sum}")
    print(f"Sum of WER: {wer_sum}")

    env = Environment(
        loader=FileSystemLoader(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
        )
    )
    env.filters["json_float"] = json_float

    for report_suffix in (".html", ".json"):
        template_fn = "summary" + report_suffix + ".j2"
        out_fn = os.path.join(reports_folder, "summary" + report_suffix)

        template = env.get_template(template_fn)
        template.stream(
            num_reports=len(cer_list),
            cer_avg=cer_avg,
            wer_avg=wer_avg,
            diff_c=diff_c,
            diff_w=diff_w,
            occurrences_threshold=occurrences_threshold,
        ).dump(out_fn)


@click.command()
@click.argument("reports_folder",
                type=click.Path(exists=True),
                default="./reports"
                )
@click.option("--occurrences-threshold",
              type=int,
              default=1,
              help="Only show differences that occur at least this many times.")
def main(reports_folder, occurrences_threshold):
    """
    Summarize the results from multiple reports generated earlier by dinglehopper.
    It calculates the average CER and WER, as well as a sum of common mistakes.
    Reports include lists of mistakes and their occurrences.

    You may use a threshold to reduce the file size of the HTML report by only showing
    mistakes whose number of occurrences is at or above the threshold. The JSON report
    will always contain all mistakes.

    All JSON files in the provided folder will be gathered and summarized.
    """
    initLogging()
    process(reports_folder, occurrences_threshold)


if __name__ == "__main__":
    main()
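For completeness, the summarization step can also be driven from Python rather than through the `dinglehopper-summarize` console script; a minimal sketch (the `out/` folder name is just an example) could look like this:

~~~
from dinglehopper.cli_summarize import process as summarize

# Aggregate all JSON reports found in out/ into out/summary.{html,json};
# only differences seen at least 10 times are listed in the HTML table.
summarize("out", occurrences_threshold=10)
~~~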

@@ -26,6 +26,22 @@
       border: 2px solid;
       border-radius: 5px;
     }
+
+    .row {
+      margin-bottom: 20px;
+    }
+
+    table {
+      width: 100%;
+    }
+
+    th {
+      cursor: pointer;
+    }
+
+    th:hover {
+      background-color: #eee;
+    }
     </style>
 </head>
 <body>
@@ -50,6 +66,32 @@
     <h2>Word differences</h2>
     {{ word_diff_report }}
 
+    {%- if differences %}
+    {% set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
+    <div class="row">
+      {% for section in sections %}
+      <div class="col-md-6">
+        <h2>{{ section['title'] }}</h2>
+        <table>
+          <thead>
+            <tr>
+              <th>GT</th>
+              <th>OCR</th>
+              <th>Occurrences</th>
+            </tr>
+          </thead>
+          {% for gt_ocr, occurrences in section['data'].items() %}
+          <tr>
+            <td>{{ gt_ocr.split(" :: ")[0] }}</td>
+            <td>{{ gt_ocr.split(" :: ")[1] }}</td>
+            <td>{{ occurrences }}</td>
+          </tr>
+          {% endfor %}
+        </table>
+      </div>
+      {% endfor %}
+    </div>
+    {%- endif %}
+
 </div>

@@ -12,4 +12,28 @@ $(document).ready(function() {
     $('.diff').mouseout(function() {
         find_diff_class($(this).attr('class')).removeClass('diff-highlight');
     });
+
+    /* Sort the table by the clicked column */
+    $('th').click(function () {
+        var table = $(this).closest('table');
+        var rows = table.find('tbody > tr').toArray().sort(compareRows($(this).index()));
+        this.asc = !this.asc;
+        if (!this.asc) {
+            rows = rows.reverse();
+        }
+        for (var i = 0; i < rows.length; i++) {
+            table.children('tbody').append(rows[i]);
+        }
+    });
+
+    function compareRows(index) {
+        return function (row1, row2) {
+            var cell1 = $(row1).children('td').eq(index).text().toLowerCase();
+            var cell2 = $(row2).children('td').eq(index).text().toLowerCase();
+            return cell1.localeCompare(cell2, undefined, {
+                numeric: true,
+                sensitivity: 'base'
+            });
+        }
+    }
 });

@@ -4,6 +4,12 @@
 {% if metrics %}
   "cer": {{ cer|json_float }},
   "wer": {{ wer|json_float }},
+{% endif %}
+{% if differences %}
+  "differences": {
+    "character_level": {{ diff_c|tojson }},
+    "word_level": {{ diff_w|tojson }}
+  },
 {% endif %}
   "n_characters": {{ n_characters }},
   "n_words": {{ n_words }}

@@ -0,0 +1,136 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
<style type="text/css">
{% if metrics %}
.gt .diff {
color: green;
}
.ocr .diff {
color: red;
}
{% else %}
.gt .diff, .ocr .diff {
color: blue;
}
{% endif %}
.ellipsis {
opacity: 0.5;
font-style: italic;
}
.diff-highlight {
border: 2px solid;
border-radius: 5px;
}
.row {
margin-bottom: 20px;
}
table {
width: 100%;
}
.cer {
flex-direction: column;
}
tr:hover {
background-color: #f5f5f5;
}
th {
cursor: pointer;
}
th:hover {
background-color: #eee;
}
td {
min-width: 100px;
}
td:hover {
background-color: #eee;
}
</style>
</head>
<body>
<div class="container">
<div class="row">
<h1>Summary of all reports</h1>
</div>
<div class="row">
<p>Number of reports: {{ num_reports }}</p>
</div>
{% if cer_avg and wer_avg -%}
<div class="row">
<h2>Metrics</h2>
</div>
<div class="row cer">
<p>Average CER: {{ cer_avg|round(4) }}</p>
<p>Average WER: {{ wer_avg|round(4) }}</p>
</div>
{% endif %}
{%- if diff_c and diff_w %}
{%- set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
<div class="row">
{%- for section in sections %}
<div class="col-md-6">
<h2>{{ section['title'] }}</h2>
<table>
<thead>
<tr><th>GT</th><th>OCR</th><th>Occurrences</th></tr>
</thead>
{%- set num_omitted = namespace(value=0) -%}
{% for gt_ocr, occurrences in section['data'].items() -%}
{% if occurrences < occurrences_threshold -%}
{%- set num_omitted.value = num_omitted.value + 1 %}
{%- else -%}
{%- set gt = gt_ocr.split(" :: ")[0] %}
{%- set ocr = gt_ocr.split(" :: ")[1] %}
<tr>
<td title="{{ gt|urlencode }}">{{ gt }}</td>{# display the unicode character #}
<td title="{{ ocr|urlencode }}">{{ ocr }}</td >
<td>{{ occurrences }}</td>
</tr>
{%- endif %}
{%- endfor %}
{% if num_omitted.value > 0 and occurrences_threshold > 1 -%}
<p>Skipped {{ num_omitted.value }} diffs with fewer than {{ occurrences_threshold }} occurrences. The complete list of diffs is available in the accompanying JSON file.</p>
{%- set num_omitted.value = 0 %}
{%- endif %}
</table>
</div>
{%- endfor %}
</div>
{%- endif %}
</div>
<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
<script>
{% include 'report.html.js' %}
</script>
</body>
</html>

@@ -0,0 +1,15 @@
{
"num_reports": {{ num_reports}}
{%- if cer_avg and wer_avg %}
,
"cer_avg": {{ cer_avg|json_float }},
"wer_avg": {{ wer_avg|json_float }}
{%- endif %}
{%- if diff_c and diff_w %}
,
"differences": {
"character_level": {{ diff_c|tojson }},
"word_level": {{ diff_w|tojson }}
}
{%- endif %}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,41 @@
import os

import pytest
from ocrd_utils import initLogging

from dinglehopper.cli import process_dir

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
def test_cli_directory(tmp_path):
    """
    Test that the cli/process_dir() processes a directory of files and
    yields JSON and HTML reports.
    """
    initLogging()
    process_dir(os.path.join(data_dir, "directory-test", "gt"),
                os.path.join(data_dir, "directory-test", "ocr"),
                "report", str(tmp_path / "reports"), False, True,
                "line")

    assert os.path.exists(tmp_path / "reports/1.xml-report.json")
    assert os.path.exists(tmp_path / "reports/1.xml-report.html")
    assert os.path.exists(tmp_path / "reports/2.xml-report.json")
    assert os.path.exists(tmp_path / "reports/2.xml-report.html")


@pytest.mark.integration
def test_cli_fail_without_gt(tmp_path):
    """
    Test that the cli/process_dir skips a file if there is no corresponding file
    in the other directory.
    """
    initLogging()
    process_dir(os.path.join(data_dir, "directory-test", "gt"),
                os.path.join(data_dir, "directory-test", "ocr"),
                "report", str(tmp_path / "reports"), False, True,
                "line")

    assert len(os.listdir(tmp_path / "reports")) == 2 * 2

@@ -0,0 +1,27 @@
import json
import os

import pytest
from ocrd_utils import initLogging

from dinglehopper.cli import process

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
def test_cli_differences(tmp_path):
    """Test that the cli/process() yields a JSON report that includes
    the differences found between the GT and OCR text"""
    initLogging()
    process(os.path.join(data_dir, "test-gt.page2018.xml"),
            os.path.join(data_dir, "test-fake-ocr.page2018.xml"),
            "report", tmp_path, differences=True)

    assert os.path.exists(tmp_path / "report.json")

    with open(tmp_path / "report.json", "r") as jsonf:
        j = json.load(jsonf)
        assert j["differences"] == {"character_level": {'n :: m': 1, 'ſ :: f': 1},
                                    "word_level": {'Augenblick :: Augemblick': 1,
                                                   'Verſprochene :: Verfprochene': 1}}

@@ -0,0 +1,101 @@
import json
import os

import pytest

from .util import working_directory
from .. import cli_summarize

expected_cer_avg = (0.05 + 0.10) / 2
expected_wer_avg = (0.15 + 0.20) / 2
expected_diff_c = {"a": 30, "b": 50}
expected_diff_w = {"c": 70, "d": 90}


@pytest.fixture
def create_summaries(tmp_path):
    """Create two JSON reports with mock data"""
    reports_dirname = tmp_path / "reports"
    reports_dirname.mkdir()

    report1 = {"cer": 0.05, "wer": 0.15,
               "differences": {
                   "character_level": {"a": 10, "b": 20},
                   "word_level": {"c": 30, "d": 40}
               }}
    report2 = {"cer": 0.10, "wer": 0.20,
               "differences": {
                   "character_level": {"a": 20, "b": 30},
                   "word_level": {"c": 40, "d": 50}
               }}

    with open(os.path.join(reports_dirname, "report1.json"), "w") as f:
        json.dump(report1, f)
    with open(os.path.join(reports_dirname, "report2.json"), "w") as f:
        json.dump(report2, f)

    return str(reports_dirname)


@pytest.mark.integration
def test_cli_summarize_json(tmp_path, create_summaries):
    """Test that the cli/process() yields a summarized JSON report"""
    with working_directory(tmp_path):
        reports_dirname = create_summaries
        cli_summarize.process(reports_dirname)

        with open(os.path.join(reports_dirname, "summary.json"), "r") as f:
            summary_data = json.load(f)

        assert summary_data["num_reports"] == 2
        assert summary_data["cer_avg"] == expected_cer_avg
        assert summary_data["wer_avg"] == expected_wer_avg
        assert summary_data["differences"]["character_level"] == expected_diff_c
        assert summary_data["differences"]["word_level"] == expected_diff_w


@pytest.mark.integration
def test_cli_summarize_html(tmp_path, create_summaries):
    """Test that the cli/process() yields an HTML report"""
    with working_directory(tmp_path):
        reports_dirname = create_summaries
        cli_summarize.process(reports_dirname)

        html_file = os.path.join(reports_dirname, "summary.html")
        assert os.path.isfile(html_file)

        with open(html_file, "r") as f:
            contents = f.read()

        assert len(contents) > 0
        assert "Number of reports: 2" in contents
        assert f"Average CER: {round(expected_cer_avg, 4)}" in contents
        assert f"Average WER: {round(expected_wer_avg, 4)}" in contents


@pytest.mark.integration
def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries):
    """
    Test that the cli/process() does not include reports that are missing a WER value.
    """
    with working_directory(tmp_path):
        reports_dirname = create_summaries

        # This third report has no WER value and should not be included in the summary
        report3 = {"cer": 0.10,
                   "differences": {
                       "character_level": {"a": 20, "b": 30},
                       "word_level": {"c": 40, "d": 50}
                   }}
        with open(os.path.join(reports_dirname, "report3-missing-wer.json"), "w") as f:
            json.dump(report3, f)

        cli_summarize.process(reports_dirname)

        html_file = os.path.join(reports_dirname, "summary.html")
        assert os.path.isfile(html_file)

        with open(html_file, "r") as f:
            contents = f.read()

        assert "Number of reports: 2" in contents  # report3 is not included

@@ -27,6 +27,7 @@ setup(
             "dinglehopper=dinglehopper.cli:main",
             "dinglehopper-line-dirs=dinglehopper.cli_line_dirs:main",
             "dinglehopper-extract=dinglehopper.cli_extract:main",
+            "dinglehopper-summarize=dinglehopper.cli_summarize:main",
             "ocrd-dinglehopper=dinglehopper.ocrd_cli:ocrd_dinglehopper",
         ]
     },
