Merge pull request #83 from INL/feat/batch-processing

Add batch processing and report summaries
2025-08-23 08:29:57 +02:00 · 2023-05-26 15:28:36 +02:00 · 2023-05-26 15:28:36 +02:00 · 35be58cb94
commit 35be58cb94
parent 6d3a8cecd2 207804e6a6
17 changed files with 17584 additions and 26 deletions
--- a/README.md
+++ b/README.md
@ -5,7 +5,8 @@ dinglehopper is an OCR evaluation tool and reads
 [ALTO](https://github.com/altoxml),
 [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files.  It
 compares a ground truth (GT) document page with a OCR result page to compute
-metrics and a word/character differences report.
+metrics and a word/character differences report. It also supports batch processing by 
+generating, aggregating and summarizing multiple reports.

 [![Build Status](https://circleci.com/gh/qurator-spk/dinglehopper.svg?style=svg)](https://circleci.com/gh/qurator-spk/dinglehopper)

@ -27,7 +28,7 @@ sudo pip install .
 Usage
 -----
 ~~~
-Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
+Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] [REPORTS_FOLDER]

  Compare the PAGE/ALTO/text document GT against the document OCR.

@ -35,19 +36,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
  their text and falls back to plain text if no ALTO or PAGE is detected.

  The files GT and OCR are usually a ground truth document and the result of
-  an OCR software, but you may use dinglehopper to compare two OCR results.
-  In that case, use --no-metrics to disable the then meaningless metrics and
-  also change the color scheme from green/red to blue.
+  an OCR software, but you may use dinglehopper to compare two OCR results. In
+  that case, use --no-metrics to disable the then meaningless metrics and also
+  change the color scheme from green/red to blue.

-  The comparison report will be written to $REPORT_PREFIX.{html,json}, where
-  $REPORT_PREFIX defaults to "report". The reports include the character
-  error rate (CER) and the word error rate (WER).
+  The comparison report will be written to
+  $REPORTS_FOLDER/$REPORT_PREFIX.{html,json}, where $REPORTS_FOLDER defaults
+  to the current working directory and $REPORT_PREFIX defaults to "report".
+  The reports include the character error rate (CER) and the word error rate
+  (WER).

  By default, the text of PAGE files is extracted on 'region' level. You may
  use "--textequiv-level line" to extract from the level of TextLine tags.

 Options:
  --metrics / --no-metrics  Enable/disable metrics and green/red
+  --differences BOOLEAN     Enable reporting character and word level
+                            differences
  --textequiv-level LEVEL   PAGE TextEquiv level to extract text from
  --progress                Show progress bar
  --help                    Show this message and exit.
@ -61,6 +66,43 @@ This generates `report.html` and `report.json`.

 ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true)

+Batch comparison between folders of GT and OCR files can be done by simply providing 
+folders:
+~~~
+dinglehopper gt/ ocr/ report output_folder/
+~~~
+This assumes that you have files with the same name in both folders, e.g. 
+`gt/00000001.page.xml` and `ocr/00000001.alto.xml`.
+
+The example generates reports for each set of files, with the prefix `report`, in the 
+(automatically created) folder `output_folder/`.
+
+By default, the JSON report does not contain the character and word differences, only
+the calculated metrics. If you want to include the differences, set the
+`--differences` option, which takes a boolean value:
+
+~~~
+dinglehopper gt/ ocr/ report output_folder/ --differences true
+~~~
+
+### dinglehopper-summarize
+A set of (JSON) reports can be summarized into a single set of 
+reports. This is useful after having generated reports in batch.
+Example:
+~~~
+dinglehopper-summarize output_folder/
+~~~
+This generates `summary.html` and `summary.json` in the same `output_folder`.
+
+If you are summarizing many reports and have used the `--differences` option while
+generating them, it may be useful to limit the number of differences reported by using
+the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
+report, making it easier to open and navigate. Note that the JSON report will still
+contain all differences. Example:
+~~~
+dinglehopper-summarize output_folder/ --occurrences-threshold 10
+~~~
+
 ### dinglehopper-line-dirs
 You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.txt`)
 with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate
--- a/dinglehopper/cli.py
+++ b/dinglehopper/cli.py
@ -1,4 +1,5 @@
 import os
+from collections import Counter

 import click
 from jinja2 import Environment, FileSystemLoader
@ -6,15 +7,15 @@ from markupsafe import escape
 from uniseg.graphemecluster import grapheme_clusters
 from ocrd_utils import initLogging

-from .character_error_rate import character_error_rate_n
-from .word_error_rate import word_error_rate_n, words_normalized
-from .align import seq_align
-from .extracted_text import ExtractedText
-from .ocr_files import extract
-from .config import Config
+from dinglehopper.character_error_rate import character_error_rate_n
+from dinglehopper.word_error_rate import word_error_rate_n, words_normalized
+from dinglehopper.align import seq_align
+from dinglehopper.extracted_text import ExtractedText
+from dinglehopper.ocr_files import extract
+from dinglehopper.config import Config


-def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
+def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
    gtx = ""
    ocrx = ""

@ -54,6 +55,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):

    g_pos = 0
    o_pos = 0
+    found_differences = []
+
    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
        css_classes = None
        gt_id = None
@ -66,6 +69,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
                # Deletions and inserts only produce one id + None, UI must
                # support this, i.e. display for the one id produced

+            if differences:
+                found_differences.append(f'{g} :: {o}')
+
        gtx += joiner + format_thing(g, css_classes, gt_id)
        ocrx += joiner + format_thing(o, css_classes, ocr_id)

@ -74,6 +80,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
        if o is not None:
            o_pos += len(o)

+    found_differences = dict(Counter(elem for elem in found_differences))
+
    return """
        <div class="row">
           <div class="col-md-6 gt">{}</div>
@ -81,7 +89,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
        </div>
        """.format(
        gtx, ocrx
-    )
+    ), found_differences


 def json_float(value):
@ -97,7 +105,8 @@ def json_float(value):
        return str(value)


-def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
+def process(gt, ocr, report_prefix, reports_folder='.', *, metrics=True,
+            differences=False, textequiv_level="region"):
    """Check OCR result against GT.

    The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
@ -110,14 +119,15 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
    cer, n_characters = character_error_rate_n(gt_text, ocr_text)
    wer, n_words = word_error_rate_n(gt_text, ocr_text)

-    char_diff_report = gen_diff_report(
-        gt_text, ocr_text, css_prefix="c", joiner="", none="·"
-    )
+    char_diff_report, diff_c = gen_diff_report(gt_text, ocr_text, css_prefix="c",
+                                               joiner="",
+                                               none="·", differences=differences)

    gt_words = words_normalized(gt_text)
    ocr_words = words_normalized(ocr_text)
-    word_diff_report = gen_diff_report(
-        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
+    word_diff_report, diff_w = gen_diff_report(
+        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯",
+        differences=differences
    )

    env = Environment(
@ -129,7 +139,11 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):

    for report_suffix in (".html", ".json"):
        template_fn = "report" + report_suffix + ".j2"
-        out_fn = report_prefix + report_suffix
+
+        if not os.path.isdir(reports_folder):
+            os.mkdir(reports_folder)
+
+        out_fn = os.path.join(reports_folder, report_prefix + report_suffix)

        template = env.get_template(template_fn)
        template.stream(
@ -142,16 +156,42 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
            char_diff_report=char_diff_report,
            word_diff_report=word_diff_report,
            metrics=metrics,
+            differences=differences,
+            diff_c=diff_c,
+            diff_w=diff_w,
        ).dump(out_fn)


+def process_dir(gt, ocr, report_prefix, reports_folder, metrics, differences,
+                textequiv_level):
+    """Compare every file in the directory gt with the same-named file in ocr.
+
+    For each entry of os.listdir(gt) that is a regular file in both
+    directories, process() is called with the report prefix
+    "<file name>-<report_prefix>" and the given reports_folder, metrics,
+    differences and textequiv_level.
+
+    Entries that are not a regular file in both directories are skipped and a
+    "Skipping ..." message is printed. Files that exist only in ocr are never
+    visited, because only the entries of gt are iterated.
+    """
+    for gt_file in os.listdir(gt):
+        gt_file_path = os.path.join(gt, gt_file)
+        # The OCR counterpart must have exactly the same file name as the GT file
+        ocr_file_path = os.path.join(ocr, gt_file)
+
+        if os.path.isfile(gt_file_path) and os.path.isfile(ocr_file_path):
+            process(gt_file_path, ocr_file_path,
+                    f"{gt_file}-{report_prefix}",
+                    reports_folder=reports_folder,
+                    metrics=metrics,
+                    differences=differences,
+                    textequiv_level=textequiv_level)
+        else:
+            print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
+
@click.command()
@click.argument("gt", type=click.Path(exists=True))
@click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report")
+@click.argument("reports_folder", type=click.Path(), default=".")
@click.option(
    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
 )
+@click.option(
+    "--differences",
+    default=False,
+    help="Enable reporting character and word level differences"
+)
@click.option(
    "--textequiv-level",
    default="region",
@ -159,7 +199,8 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
    metavar="LEVEL",
 )
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
-def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
+def main(gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level,
+         progress):
    """
    Compare the PAGE/ALTO/text document GT against the document OCR.

@ -171,7 +212,8 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
    that case, use --no-metrics to disable the then meaningless metrics and also
    change the color scheme from green/red to blue.

-    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
+    The comparison report will be written to $REPORTS_FOLDER/$REPORT_PREFIX.{html,json},
+    where $REPORTS_FOLDER defaults to the current working directory and
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).

@ -180,7 +222,17 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
    """
    initLogging()
    Config.progress = progress
-    process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
+    if os.path.isdir(gt):
+        if not os.path.isdir(ocr):
+            raise click.BadParameter(
+                "OCR must be a directory if GT is a directory", param_hint="ocr"
+            )
+        else:
+            process_dir(gt, ocr, report_prefix, reports_folder, metrics,
+                        differences, textequiv_level)
+    else:
+        process(gt, ocr, report_prefix, reports_folder, metrics=metrics,
+                differences=differences, textequiv_level=textequiv_level)


 if __name__ == "__main__":
--- a/dinglehopper/cli_summarize.py
+++ b/dinglehopper/cli_summarize.py
@ -0,0 +1,101 @@
+import json
+import os
+
+import click
+from ocrd_utils import initLogging
+from jinja2 import Environment, FileSystemLoader
+
+from dinglehopper.cli import json_float
+
+
+def process(reports_folder, occurrences_threshold=1):
+    """Summarize the dinglehopper JSON reports found in reports_folder.
+
+    Every *.json file in the folder is read. Files that lack a "cer" or "wer"
+    entry are skipped with a message. The CER and WER of the remaining reports
+    are averaged and their character/word level differences (if a report has
+    any) are added up per difference.
+
+    The result is written to summary.html and summary.json in reports_folder.
+    If no usable report is found, a message is printed and nothing is written.
+
+    :param reports_folder: folder that contains the JSON reports
+    :param occurrences_threshold: passed on to the templates; the HTML summary
+        omits differences that occur fewer times than this
+    """
+    cer_list = []
+    wer_list = []
+    diff_c = {}
+    diff_w = {}
+
+    for report in os.listdir(reports_folder):
+        if not report.endswith(".json"):
+            continue
+
+        with open(os.path.join(reports_folder, report), "r") as f:
+            report_data = json.load(f)
+
+        if "cer" not in report_data or "wer" not in report_data:
+            click.echo(
+                f"Skipping {report} because it does not contain CER and WER")
+            continue
+
+        cer_list.append(report_data["cer"])
+        wer_list.append(report_data["wer"])
+
+        # Reports generated without --differences (the default) have no
+        # "differences" entry; they still count towards the averages.
+        differences = report_data.get("differences") or {}
+        for key, value in differences.get("character_level", {}).items():
+            diff_c[key] = diff_c.get(key, 0) + value
+        for key, value in differences.get("word_level", {}).items():
+            diff_w[key] = diff_w.get(key, 0) + value
+
+    if not cer_list:
+        click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
+        return
+
+    cer_sum = sum(cer_list)
+    wer_sum = sum(wer_list)
+    cer_avg = cer_sum / len(cer_list)
+    wer_avg = wer_sum / len(wer_list)
+
+    print(f"Number of reports: {len(cer_list)}")
+    print(f"Average CER: {cer_avg}")
+    print(f"Average WER: {wer_avg}")
+    print(f"Sum of CER: {cer_sum}")
+    print(f"Sum of WER: {wer_sum}")
+
+    env = Environment(
+        loader=FileSystemLoader(
+            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
+        )
+    )
+    env.filters["json_float"] = json_float
+    for report_suffix in (".html", ".json"):
+        template_fn = "summary" + report_suffix + ".j2"
+
+        out_fn = os.path.join(reports_folder, 'summary' + report_suffix)
+        template = env.get_template(template_fn)
+        template.stream(
+            num_reports=len(cer_list),
+            cer_avg=cer_avg,
+            wer_avg=wer_avg,
+            diff_c=diff_c,
+            diff_w=diff_w,
+            occurrences_threshold=occurrences_threshold,
+        ).dump(out_fn)
+
+
+@click.command()
+@click.argument("reports_folder",
+                type=click.Path(exists=True),
+                default="./reports"
+                )
+@click.option("--occurrences-threshold",
+              type=int,
+              default=1,
+              help="Only show differences that occur at least this many times.")
+def main(reports_folder, occurrences_threshold):
+    """
+    Summarize the results from multiple reports generated earlier by dinglehopper.
+    It calculates the average CER and WER, as well as a sum of common mistakes.
+    Reports include lists of mistakes and their occurrences.
+
+    You may use a threshold to reduce the file size of the HTML report by only showing
+    mistakes that occur at least as many times as the threshold. The JSON report will
+    always contain all mistakes.
+
+    All JSON files in the provided folder will be gathered and summarized.
+    """
+    # The docstring above doubles as the --help text of the command.
+    initLogging()
+    process(reports_folder, occurrences_threshold)
+
+
+if __name__ == "__main__":
+    main()
--- a/dinglehopper/templates/report.html.j2
+++ b/dinglehopper/templates/report.html.j2
@ -26,6 +26,22 @@
      border: 2px solid;
      border-radius: 5px;
    }
+
+    .row {
+        margin-bottom: 20px;
+    }
+
+    table {
+        width: 100%;
+    }
+
+    th {
+        cursor: pointer;
+    }
+
+    th:hover {
+        background-color: #eee;
+    }
    </style>
 </head>
 <body>
@ -50,6 +66,32 @@
 <h2>Word differences</h2>
 {{ word_diff_report }}

+{%- if differences %}
+{# One table per level; the keys of diff_c/diff_w have the form "<gt> :: <ocr>" #}
+{% set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
+
+<div class="row">
+{% for section in sections %}
+    <div class="col-md-6">
+        <h2>{{ section['title'] }}</h2>
+        <table>
+            <thead>
+            <tr>
+                <th>GT</th>
+                <th>OCR</th>
+                <th>Occurrences</th>
+            </tr>
+            </thead>
+            {# The column sorting in report.html.js only reorders rows inside <tbody> #}
+            <tbody>
+            {% for gt_ocr, occurrences in section['data'].items() %}
+                <tr>
+                    <td>{{ gt_ocr.split(" :: ")[0] }}</td>
+                    <td>{{ gt_ocr.split(" :: ")[1] }}</td>
+                    <td>{{ occurrences }}</td>
+                </tr>
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+{% endfor %}
+</div>
+{%- endif %}

 </div>

--- a/dinglehopper/templates/report.html.js
+++ b/dinglehopper/templates/report.html.js
@ -12,4 +12,28 @@ $(document).ready(function() {
    $('.diff').mouseout(function() {
        find_diff_class($(this).attr('class')).removeClass('diff-highlight');
    });
+
+    /*
+     * Sort this column of the table: clicking a header cell sorts the
+     * <tbody> rows of its table by the clicked column. The first click sorts
+     * ascending; every further click on the same header flips the direction.
+     */
+    $('th').click(function () {
+        var table = $(this).closest('table');
+        var rows = table.find('tbody > tr').toArray().sort(compareRows($(this).index()));
+        /* The sort direction is remembered on the header element itself */
+        this.asc = !this.asc;
+        if (!this.asc) {
+            rows = rows.reverse();
+        }
+        /* Appending an existing row moves it, so this reorders the body in place */
+        for (var i = 0; i < rows.length; i++) {
+            table.children('tbody').append(rows[i]);
+        }
+    });
+
+    /*
+     * Return a comparator for two table rows that compares the text of the
+     * cell at the given column index, ignoring case and using numeric
+     * collation.
+     */
+    function compareRows(index) {
+        return function (row1, row2) {
+            var cell1 = $(row1).children('td').eq(index).text().toLowerCase();
+            var cell2 = $(row2).children('td').eq(index).text().toLowerCase();
+            return cell1.localeCompare(cell2, undefined, {
+                numeric: true,
+                sensitivity: 'base'
+            });
+        }
+    }
 });
--- a/dinglehopper/templates/report.json.j2
+++ b/dinglehopper/templates/report.json.j2
@ -4,6 +4,12 @@
 {% if metrics %}
    "cer": {{ cer|json_float }},
    "wer": {{ wer|json_float }},
+{% endif %}
+{% if differences %}
+    "differences": {
+        "character_level": {{ diff_c|tojson }},
+        "word_level": {{ diff_w|tojson }}
+    },
 {% endif %}
    "n_characters": {{ n_characters }},
    "n_words": {{ n_words }}
--- a/dinglehopper/templates/summary.html.j2
+++ b/dinglehopper/templates/summary.html.j2
@ -0,0 +1,136 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+
+    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
+    <style type="text/css">
+    {% if metrics %}
+    .gt .diff {
+        color: green;
+    }
+    .ocr .diff {
+        color: red;
+    }
+    {% else %}
+    .gt .diff, .ocr .diff {
+        color: blue;
+    }
+    {% endif %}
+    .ellipsis {
+        opacity: 0.5;
+        font-style: italic;
+    }
+    .diff-highlight {
+      border: 2px solid;
+      border-radius: 5px;
+    }
+
+    .row {
+        margin-bottom: 20px;
+    }
+
+    table {
+        width: 100%;
+    }
+
+    .cer {
+        flex-direction: column;
+    }
+
+    tr:hover {
+        background-color: #f5f5f5;
+    }
+
+    th {
+        cursor: pointer;
+    }
+
+    th:hover {
+        background-color: #eee;
+    }
+
+    td {
+        min-width: 100px;
+    }
+
+    td:hover {
+        background-color: #eee;
+    }
+    </style>
+</head>
+<body>
+
+<div class="container">
+
+<div class="row">
+    <h1>Summary of all reports</h1>
+</div>
+
+<div class="row">
+    <p>Number of reports: {{ num_reports }}</p>
+</div>
+
+{% if cer_avg and wer_avg -%}
+<div class="row">
+    <h2>Metrics</h2>
+</div>
+
+<div class="row cer">
+    <p>Average CER: {{ cer_avg|round(4) }}</p>
+    <p>Average WER: {{ wer_avg|round(4) }}</p>
+</div>
+{% endif %}
+
+{%- if diff_c and diff_w %}
+{%- set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
+
+<div class="row">
+{%- for section in sections %}
+    <div class="col-md-6">
+        <h2>{{ section['title'] }}</h2>
+        {#- Rows below the threshold are only counted, not rendered -#}
+        {%- set num_omitted = namespace(value=0) %}
+        <table>
+            <thead>
+            <tr><th>GT</th><th>OCR</th><th>Occurrences</th></tr>
+            </thead>
+            <tbody>
+            {%- for gt_ocr, occurrences in section['data'].items() -%}
+                {% if occurrences < occurrences_threshold -%}
+                    {%- set num_omitted.value = num_omitted.value + 1 %}
+                {%- else -%}
+                    {%- set gt = gt_ocr.split(" :: ")[0] %}
+                    {%- set ocr = gt_ocr.split(" :: ")[1] %}
+                    <tr>
+                        <td title="{{ gt|urlencode }}">{{ gt }}</td>{# display the unicode character #}
+                        <td title="{{ ocr|urlencode }}">{{ ocr }}</td>
+                        <td>{{ occurrences }}</td>
+                    </tr>
+                {%- endif %}
+            {%- endfor %}
+            </tbody>
+        </table>
+        {#- The note is placed after the table: a <p> is not allowed inside <table> #}
+        {% if num_omitted.value > 0 and occurrences_threshold > 1 -%}
+            <p>Skipped {{ num_omitted.value }} diffs with fewer than {{ occurrences_threshold }} occurrences. The complete list of diffs is available in the accompanying JSON file.</p>
+        {%- endif %}
+    </div>
+{%- endfor %}
+</div>
+{%- endif %}
+
+</div>
+
+
+
+<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
+<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
+
+<script>
+{% include 'report.html.js' %}
+</script>
+
+
+</body>
+</html>
--- a/dinglehopper/templates/summary.json.j2
+++ b/dinglehopper/templates/summary.json.j2
@ -0,0 +1,15 @@
+{#- Summary of several reports: averages plus summed differences, if any were found -#}
+{
+"num_reports": {{ num_reports}}
+{#- Test for a number rather than truthiness so that an average of 0 is still reported -#}
+{%- if cer_avg is number and wer_avg is number %}
+    ,
+    "cer_avg": {{ cer_avg|json_float }},
+    "wer_avg": {{ wer_avg|json_float }}
+{%- endif %}
+{%- if diff_c or diff_w %}
+    ,
+    "differences": {
+        "character_level": {{ diff_c|tojson }},
+        "word_level": {{ diff_w|tojson }}
+    }
+{%- endif %}
+}
--- a/dinglehopper/tests/data/directory-test/gt/1.xml
+++ b/dinglehopper/tests/data/directory-test/gt/1.xml
--- a/dinglehopper/tests/data/directory-test/gt/2.xml
+++ b/dinglehopper/tests/data/directory-test/gt/2.xml
--- a/dinglehopper/tests/data/directory-test/ocr/1.xml
+++ b/dinglehopper/tests/data/directory-test/ocr/1.xml
--- a/dinglehopper/tests/data/directory-test/ocr/2.xml
+++ b/dinglehopper/tests/data/directory-test/ocr/2.xml
--- a/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
+++ b/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
--- a/dinglehopper/tests/test_integ_cli_dir.py
+++ b/dinglehopper/tests/test_integ_cli_dir.py
@ -0,0 +1,41 @@
+import os
+import pytest
+from ocrd_utils import initLogging
+from dinglehopper.cli import process_dir
+
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+@pytest.mark.integration
+def test_cli_directory(tmp_path):
+    """
+    Check that cli/process_dir() walks a pair of directories and writes a
+    JSON and an HTML report for every file that is present in both.
+    """
+
+    initLogging()
+    test_dir = os.path.join(data_dir, "directory-test")
+    reports_dir = tmp_path / "reports"
+    process_dir(
+        os.path.join(test_dir, "gt"),
+        os.path.join(test_dir, "ocr"),
+        "report",
+        str(reports_dir),
+        False,
+        True,
+        "line",
+    )
+
+    # Report names are "<input file name>-<prefix>.<suffix>"
+    for page in ("1", "2"):
+        for suffix in ("json", "html"):
+            assert os.path.exists(reports_dir / f"{page}.xml-report.{suffix}")
+
+
+@pytest.mark.integration
+def test_cli_fail_without_gt(tmp_path):
+    """
+    Check that cli/process_dir() skips a file that has no counterpart in the
+    other directory.
+    """
+
+    initLogging()
+    test_dir = os.path.join(data_dir, "directory-test")
+    reports_dir = tmp_path / "reports"
+    process_dir(
+        os.path.join(test_dir, "gt"),
+        os.path.join(test_dir, "ocr"),
+        "report",
+        str(reports_dir),
+        False,
+        True,
+        "line",
+    )
+
+    # Two matching file pairs, each yielding one JSON and one HTML report
+    generated = os.listdir(reports_dir)
+    assert len(generated) == 2 * 2
--- a/dinglehopper/tests/test_integ_differences.py
+++ b/dinglehopper/tests/test_integ_differences.py
@ -0,0 +1,27 @@
+import json
+import os
+import pytest
+from ocrd_utils import initLogging
+from dinglehopper.cli import process
+
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+@pytest.mark.integration
+def test_cli_differences(tmp_path):
+    """Check that the JSON report written by cli/process() lists the
+        differences found between the GT and the OCR text"""
+
+    initLogging()
+    gt_file = os.path.join(data_dir, "test-gt.page2018.xml")
+    ocr_file = os.path.join(data_dir, "test-fake-ocr.page2018.xml")
+    process(gt_file, ocr_file, "report", tmp_path, differences=True)
+
+    json_report = tmp_path / "report.json"
+    assert os.path.exists(json_report)
+
+    with open(json_report, "r") as jsonf:
+        report = json.load(jsonf)
+
+    expected = {
+        "character_level": {'n :: m': 1, 'ſ :: f': 1},
+        "word_level": {'Augenblick :: Augemblick': 1,
+                       'Verſprochene :: Verfprochene': 1},
+    }
+    assert report["differences"] == expected
--- a/dinglehopper/tests/test_integ_summarize.py
+++ b/dinglehopper/tests/test_integ_summarize.py
@ -0,0 +1,101 @@
+import json
+import os
+import pytest
+from .util import working_directory
+from .. import cli_summarize
+
+expected_cer_avg = (0.05 + 0.10) / 2
+expected_wer_avg = (0.15 + 0.20) / 2
+expected_diff_c = {"a": 30, "b": 50}
+expected_diff_w = {"c": 70, "d": 90}
+
+
+@pytest.fixture
+def create_summaries(tmp_path):
+    """Write two mock JSON reports into a new "reports" folder and return its path"""
+    reports_dirname = tmp_path / "reports"
+    reports_dirname.mkdir()
+
+    mock_reports = {
+        "report1.json": {
+            "cer": 0.05,
+            "wer": 0.15,
+            "differences": {
+                "character_level": {"a": 10, "b": 20},
+                "word_level": {"c": 30, "d": 40},
+            },
+        },
+        "report2.json": {
+            "cer": 0.10,
+            "wer": 0.20,
+            "differences": {
+                "character_level": {"a": 20, "b": 30},
+                "word_level": {"c": 40, "d": 50},
+            },
+        },
+    }
+
+    for filename, report in mock_reports.items():
+        with open(os.path.join(reports_dirname, filename), "w") as f:
+            json.dump(report, f)
+
+    return str(reports_dirname)
+
+
+@pytest.mark.integration
+def test_cli_summarize_json(tmp_path, create_summaries):
+    """Check that cli_summarize.process() writes a summarized JSON report"""
+    with working_directory(tmp_path):
+        reports_dirname = create_summaries
+        cli_summarize.process(reports_dirname)
+
+        summary_fn = os.path.join(reports_dirname, "summary.json")
+        with open(summary_fn, "r") as f:
+            summary_data = json.load(f)
+
+        assert summary_data["num_reports"] == 2
+        assert summary_data["cer_avg"] == expected_cer_avg
+        assert summary_data["wer_avg"] == expected_wer_avg
+
+        # The occurrences of each difference are added up over both reports
+        found = summary_data["differences"]
+        assert found["character_level"] == expected_diff_c
+        assert found["word_level"] == expected_diff_w
+
+
+@pytest.mark.integration
+def test_cli_summarize_html(tmp_path, create_summaries):
+    """Check that cli_summarize.process() writes an HTML summary"""
+    with working_directory(tmp_path):
+        reports_dirname = create_summaries
+        cli_summarize.process(reports_dirname)
+
+        html_file = os.path.join(reports_dirname, "summary.html")
+        assert os.path.isfile(html_file)
+
+        with open(html_file, "r") as f:
+            contents = f.read()
+
+        assert len(contents) > 0
+        expected_lines = (
+            "Number of reports: 2",
+            f"Average CER: {round(expected_cer_avg, 4)}",
+            f"Average WER: {round(expected_wer_avg, 4)}",
+        )
+        for expected in expected_lines:
+            assert expected in contents
+
+
+@pytest.mark.integration
+def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries):
+    """
+    Check that cli_summarize.process() leaves out reports that have no WER value.
+    """
+    with working_directory(tmp_path):
+        reports_dirname = create_summaries
+
+        # A third report without a WER value; it must not be counted
+        incomplete_report = {
+            "cer": 0.10,
+            "differences": {
+                "character_level": {"a": 20, "b": 30},
+                "word_level": {"c": 40, "d": 50},
+            },
+        }
+        incomplete_fn = os.path.join(reports_dirname, "report3-missing-wer.json")
+        with open(incomplete_fn, "w") as f:
+            json.dump(incomplete_report, f)
+
+        cli_summarize.process(reports_dirname)
+
+        html_file = os.path.join(reports_dirname, "summary.html")
+        assert os.path.isfile(html_file)
+
+        with open(html_file, "r") as f:
+            contents = f.read()
+
+        assert "Number of reports: 2" in contents  # report3 is not included
--- a/setup.py
+++ b/setup.py
@ -27,6 +27,7 @@ setup(
            "dinglehopper=dinglehopper.cli:main",
            "dinglehopper-line-dirs=dinglehopper.cli_line_dirs:main",
            "dinglehopper-extract=dinglehopper.cli_extract:main",
+            "dinglehopper-summarize=dinglehopper.cli_summarize:main",
            "ocrd-dinglehopper=dinglehopper.ocrd_cli:ocrd_dinglehopper",
        ]
    },