Merge branch 'master' into performance

2025-07-02 23:19:58 +02:00 · 2024-01-02 20:22:38 +01:00 · 2024-01-02 20:22:38 +01:00 · 38fcbc8e1c
commit 38fcbc8e1c
parent 68a12f8f7f f077ce2e1b
101 changed files with 58154 additions and 199 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -1,23 +0,0 @@
-version: 2.1
-
-jobs:
-  test:
-    parameters:
-      python-version:
-        type: string
-    docker:
-      - image: cimg/python:<< parameters.python-version >>
-    steps:
-      - checkout
-      - run: pip3 install --upgrade pip
-      - run: pip3 install -r requirements.txt
-      - run: pip3 install pytest
-      - run: pytest
-
-workflows:
-  all-tests:
-    jobs:
-      - test:
-          matrix:
-            parameters:
-              python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
--- a/.editorconfig
+++ b/.editorconfig
@ -15,7 +15,7 @@ indent_size = 2

 [*.json]
 indent_size = 2
-insert_final_newline = false
+insert_final_newline = true

 # trailing spaces in markdown indicate word wrap
 [*.md]
--- a/.github/workflows/release-check-version-tag
+++ b/.github/workflows/release-check-version-tag
@ -0,0 +1,14 @@
+#!/bin/bash
+
+# We call setuptools.setup() here as we may rely on setuptools to interpret
+# a dynamic version field. (Reading pyproject.toml is not enough in that case.)
+expected_git_tag="v$(python -c 'from setuptools import setup; setup()' --version)"
+actual_git_tag="$(git describe --tags)"
+
+if [[ "$expected_git_tag" == "$actual_git_tag" ]]; then
+  echo "OK: Python package version $expected_git_tag matches git tag"
+  exit 0
+else
+  echo "ERROR: Python package version $expected_git_tag does NOT match git tag $actual_git_tag"
+  exit 1
+fi
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -0,0 +1,69 @@
+name: release
+
+on:
+  push:
+    tags:
+      - "v*.*.*"
+
+env:
+  PYPI_URL: https://pypi.org/p/dinglehopper
+
+jobs:
+  test:
+    uses: ./.github/workflows/test.yml
+
+  build:
+    needs: test
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Upgrade pip
+        run: python3 -m pip install --upgrade pip
+      - name: Install setuptools
+        run: |
+          python3 -m pip install --upgrade setuptools
+          # For OCR-D tools, we need setuptools-ocrd to get the version
+          if [ -e ocrd-tool.json ]; then
+            python3 -m pip install setuptools-ocrd
+          fi
+      - name: Check git tag vs package version
+        run: .github/workflows/release-check-version-tag
+      - name: Build package
+        run: python3 -m pip install --upgrade build && python3 -m build
+      - name: Upload dist
+        uses: actions/upload-artifact@v3
+        with:
+          name: dist
+          path: dist/
+
+  github-release:
+    needs: build
+    runs-on: ubuntu-latest
+    steps:
+      - name: Download dist
+        uses: actions/download-artifact@v3
+        with:
+          name: dist
+          path: dist/
+      - name: Create release on GitHub
+        uses: softprops/action-gh-release@v1
+        with:
+          files: dist/*
+
+  pypi-publish:
+    needs: build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: ${{ env.PYPI_URL }}
+    permissions:
+      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
+    steps:
+      - name: Download dist
+        uses: actions/download-artifact@v3
+        with:
+          name: dist
+          path: dist/
+      - name: Publish package distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -0,0 +1,76 @@
+name: test
+
+on:
+
+  push:
+    branches:
+      - master
+
+  pull_request:
+    branches:
+      - master
+
+  schedule:
+    - cron: "00 16 07 * *"  # = monthly
+
+  # Allow manually running (from GitHub Web)
+  workflow_dispatch:
+
+  # Allow calling this workflow (e.g. from release workflow)
+  workflow_call:
+
+jobs:
+  test:
+
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12" ]
+
+    # For Python 3.6, we need to fall back to Ubuntu 20.04
+    runs-on: ${{ matrix.python-version == '3.6' && 'ubuntu-20.04' || 'ubuntu-latest' }}
+
+    env:
+      test_results_dir: test-results-${{ matrix.python-version }}
+
+    steps:
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Update pip
+        run: python3 -m pip install -U pip
+      - name: Avoid compiling OpenCV and NumPy on Python 3.6
+        run: |
+          if python3 --version | grep -q "Python 3.6"; then
+             pip install --prefer-binary -U opencv-python-headless numpy
+          fi
+      - name: Install requirements*.txt
+        run: |
+          for requirements_txt in requirements*.txt; do
+            python3 -m pip install -r $requirements_txt;
+          done
+
+      - name: Test
+        run: |
+            cd src
+            mkdir -p ../$test_results_dir
+            python3 -m pytest --junitxml=../$test_results_dir/junit.xml -o junit_family=legacy
+      - name: Upload test results
+        uses: actions/upload-artifact@v3
+        if: success() || failure()
+        with:
+          name: ${{ env.test_results_dir }}
+          path: ${{ env.test_results_dir }}
+
+      - name: Report tests
+        uses: dorny/test-reporter@v1
+        if: success() || failure()
+        with:
+          name: Results on Python ${{ matrix.python-version }}
+          path: "${{env.test_results_dir }}/junit.xml"
+          reporter: java-junit
--- a/.gitignore
+++ b/.gitignore
@ -16,6 +16,7 @@ htmlcov/
 .venv
 env/
 venv/
+.python-version

 # mypy
 .mypy_cache/
@ -27,3 +28,4 @@ dmypy.json

 # Build artifacts
 /build
+/dist
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,36 @@
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+    -   id: check-json
+    -   id: check-toml
+    -   id: check-yaml
+    -   id: check-added-large-files
+    -   id: check-ast
+
+-   repo: https://github.com/psf/black
+    rev: 23.10.0
+    hooks:
+    -   id: black
+
+-   repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.1
+    hooks:
+    -   args:
+        - --fix
+        - --exit-non-zero-on-fix
+        id: ruff
+
+-   repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.6.1
+    hooks:
+    -   additional_dependencies:
+        - types-setuptools
+        id: mypy
+
+-   repo: https://gitlab.com/vojko.pribudic/pre-commit-update
+    rev: v0.1.0
+    hooks:
+    -   id: pre-commit-update
--- a/README-DEV.md
+++ b/README-DEV.md
@ -1,6 +1,6 @@
 Testing
 =======
-Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests):
+Use `pytest` to run the tests in [the tests directory](dinglehopper/tests):
 ```bash
 virtualenv -p /usr/bin/python3 venv
 . venv/bin/activate
@ -10,6 +10,7 @@ pytest
 ```

 ## Test running examples
+
 Only unit tests:
 ```bash
 pytest -m "not integration"
@ -27,11 +28,18 @@ pytest

 All tests with code coverage:
 ```bash
-pytest --cov=qurator --cov-report=html
+pytest --cov=dinglehopper --cov-report=html
 ```

 Static code analysis:
 ```bash
-pytest -k "not test" --flake8
 pytest -k "not test" --mypy
+pytest -k "not test" --ruff
 ```
+
+# How to use pre-commit
+
+This project optionally uses [pre-commit](https://pre-commit.com) to check commits. To use it:
+
+- Install pre-commit, e.g. `pip install -r requirements-dev.txt`
+- Install the repo-local git hooks: `pre-commit install`
--- a/README.md
+++ b/README.md
@ -5,9 +5,13 @@ dinglehopper is an OCR evaluation tool and reads
 [ALTO](https://github.com/altoxml),
 [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files.  It
 compares a ground truth (GT) document page with a OCR result page to compute
-metrics and a word/character differences report.
+metrics and a word/character differences report. It also supports batch processing by
+generating, aggregating and summarizing multiple reports.

-[![Build Status](https://circleci.com/gh/qurator-spk/dinglehopper.svg?style=svg)](https://circleci.com/gh/qurator-spk/dinglehopper)
+[![Tests](https://github.com/qurator-spk/dinglehopper/workflows/test/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
+[![GitHub tag](https://img.shields.io/github/tag/qurator-spk/dinglehopper?include_prereleases=&sort=semver&color=blue)](https://github.com/qurator-spk/dinglehopper/releases/)
+[![License](https://img.shields.io/badge/License-Apache-blue)](#license)
+[![issues - dinglehopper](https://img.shields.io/github/issues/qurator-spk/dinglehopper)](https://github.com/qurator-spk/dinglehopper/issues)

 Goals
 -----
@ -19,15 +23,16 @@ Goals

 Installation
 ------------
-It's best to use pip, e.g.:
-~~~
-sudo pip install .
-~~~
+
+It's best to use pip to install the package from PyPI, e.g.:
+```
+pip install dinglehopper
+```

 Usage
 -----
 ~~~
-Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
+Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] [REPORTS_FOLDER]

  Compare the PAGE/ALTO/text document GT against the document OCR.

@ -35,19 +40,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
  their text and falls back to plain text if no ALTO or PAGE is detected.

  The files GT and OCR are usually a ground truth document and the result of
-  an OCR software, but you may use dinglehopper to compare two OCR results.
-  In that case, use --no-metrics to disable the then meaningless metrics and
-  also change the color scheme from green/red to blue.
+  an OCR software, but you may use dinglehopper to compare two OCR results. In
+  that case, use --no-metrics to disable the then meaningless metrics and also
+  change the color scheme from green/red to blue.

-  The comparison report will be written to $REPORT_PREFIX.{html,json}, where
-  $REPORT_PREFIX defaults to "report". The reports include the character
-  error rate (CER) and the word error rate (WER).
+  The comparison report will be written to
+  $REPORTS_FOLDER/$REPORT_PREFIX.{html,json}, where $REPORTS_FOLDER defaults
+  to the current working directory and $REPORT_PREFIX defaults to "report".
+  The reports include the character error rate (CER) and the word error rate
+  (WER).

  By default, the text of PAGE files is extracted on 'region' level. You may
  use "--textequiv-level line" to extract from the level of TextLine tags.

 Options:
  --metrics / --no-metrics  Enable/disable metrics and green/red
+  --differences BOOLEAN     Enable reporting character and word level
+                            differences
  --textequiv-level LEVEL   PAGE TextEquiv level to extract text from
  --progress                Show progress bar
  --help                    Show this message and exit.
@ -61,6 +70,43 @@ This generates `report.html` and `report.json`.

 ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true)

+Batch comparison between folders of GT and OCR files can be done by simply providing
+folders:
+~~~
+dinglehopper gt/ ocr/ report output_folder/
+~~~
+This assumes that you have files with the same name in both folders, e.g.
+`gt/00000001.page.xml` and `ocr/00000001.page.xml`. Files that do not have a
+counterpart with the same name in the other folder are skipped.
+
+The example generates an HTML and a JSON report for each pair of files in the
+(automatically created) folder `output_folder/`. Each report is named after the GT
+file with `-report` appended, e.g. `00000001.page.xml-report.html`.
+
+By default, the JSON report does not contain the character and word differences, only
+the calculated metrics. If you want to include the differences, set the
+`--differences` option to `true`:
+
+~~~
+dinglehopper gt/ ocr/ report output_folder/ --differences true
+~~~
+
+### dinglehopper-summarize
+A set of (JSON) reports can be summarized into a single set of
+reports. This is useful after having generated reports in batch.
+Example:
+~~~
+dinglehopper-summarize output_folder/
+~~~
+This generates `summary.html` and `summary.json` in the same `output_folder`.
+
+If you are summarizing many reports and have used the `--differences` flag while
+generating them, it may be useful to limit the number of differences reported by using
+the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
+report, making it easier to open and navigate. Note that the JSON report will still
+contain all differences. Example:
+~~~
+dinglehopper-summarize output_folder/ --occurrences-threshold 10
+~~~
+
 ### dinglehopper-line-dirs
 You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.txt`)
 with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate
--- a/ocrd-tool.json
+++ b/ocrd-tool.json
@ -1 +1 @@
-qurator/dinglehopper/ocrd-tool.json
+src/dinglehopper/ocrd-tool.json
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,70 @@
+[build-system]
+requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"]
+
+[project]
+name = "dinglehopper"
+authors = [
+    {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
+    {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
+]
+description = "The OCR evaluation tool"
+readme = "README.md"
+requires-python = ">=3.6"
+keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
+
+dynamic = ["version", "dependencies", "optional-dependencies"]
+
+# https://pypi.org/classifiers/
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Environment :: Console",
+    "Intended Audience :: Science/Research",
+    "Intended Audience :: Other Audience",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+    "Topic :: Scientific/Engineering :: Information Analysis",
+    "Topic :: Text Processing",
+]
+
+[project.scripts]
+dinglehopper = "dinglehopper.cli:main"
+dinglehopper-line-dirs = "dinglehopper.cli_line_dirs:main"
+dinglehopper-extract = "dinglehopper.cli_extract:main"
+dinglehopper-summarize = "dinglehopper.cli_summarize:main"
+ocrd-dinglehopper = "dinglehopper.ocrd_cli:ocrd_dinglehopper"
+
+
+[project.urls]
+Homepage = "https://github.com/qurator-spk/dinglehopper"
+Repository = "https://github.com/qurator-spk/dinglehopper.git"
+
+
+[tool.setuptools.dynamic]
+dependencies = {file = ["requirements.txt"]}
+optional-dependencies.dev = {file = ["requirements-dev.txt"]}
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-data]
+dinglehopper = ["templates/*"]
+
+
+[tool.pytest.ini_options]
+minversion = 6.0
+addopts = "--strict-markers"
+markers = [
+    "integration: integration tests",
+]
+
+
+[tool.mypy]
+ignore_missing_imports = true
+
+
+[tool.ruff]
+select = ["E", "F", "I"]
+ignore = [
+    "F811",  # multimethods are considered redefinitions by ruff
+]
--- a/pytest.ini
+++ b/pytest.ini
@ -1,4 +0,0 @@
-[pytest]
-markers =
-    integration: integration tests
-    serial
--- a/qurator/init.py
+++ b/qurator/init.py
@ -1 +0,0 @@
-__import__("pkg_resources").declare_namespace(__name__)
--- a/qurator/dinglehopper/init.py
+++ b/qurator/dinglehopper/init.py
@ -1,5 +0,0 @@
-from .ocr_files import *
-from .extracted_text import *
-from .character_error_rate import *
-from .word_error_rate import *
-from .align import *
--- a/qurator/dinglehopper/templates/report.html.js
+++ b/qurator/dinglehopper/templates/report.html.js
@ -1,15 +0,0 @@
-function find_diff_class(classes) {
-    return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
-}
-
-$(document).ready(function() {
-    /* Enable Bootstrap tooltips */
-    $('[data-toggle="tooltip"]').tooltip();
-
-    $('.diff').mouseover(function() {
-        find_diff_class($(this).attr('class')).addClass('diff-highlight');
-    });
-    $('.diff').mouseout(function() {
-        find_diff_class($(this).attr('class')).removeClass('diff-highlight');
-    });
-});
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -1,5 +1,8 @@
 pytest
-pytest-flake8
 pytest-cov
 pytest-mypy
 black
+pre-commit
+
+ruff ; python_version >= "3.7"
+pytest-ruff ; python_version >= "3.7"
--- a/requirements.txt
+++ b/requirements.txt
@ -10,4 +10,4 @@ attrs
 multimethod >= 1.3
 tqdm
 rapidfuzz >= 2.7.0
-six  # XXX workaround OCR-D/core#730
+chardet
--- a/setup.cfg
+++ b/setup.cfg
@ -1,12 +0,0 @@
-[flake8]
-max-line-length = 88
-extend-ignore = E203, W503
-
-[pylint]
-max-line-length = 88
-
-[pylint.messages_control]
-disable = C0330, C0326
-
-[mypy]
-ignore_missing_imports = True
--- a/setup.py
+++ b/setup.py
@ -1,34 +0,0 @@
-from io import open
-from setuptools import find_packages, setup
-
-with open("requirements.txt") as fp:
-    install_requires = fp.read()
-
-with open('requirements-dev.txt') as fp:
-    tests_require = fp.read()
-
-setup(
-    name="dinglehopper",
-    author="Mike Gerber, The QURATOR SPK Team",
-    author_email="mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de",
-    description="The OCR evaluation tool",
-    long_description=open("README.md", "r", encoding="utf-8").read(),
-    long_description_content_type="text/markdown",
-    keywords="qurator ocr",
-    license="Apache",
-    namespace_packages=["qurator"],
-    packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
-    install_requires=install_requires,
-    tests_require=tests_require,
-    package_data={
-        "": ["*.json", "templates/*"],
-    },
-    entry_points={
-        "console_scripts": [
-            "dinglehopper=qurator.dinglehopper.cli:main",
-            "dinglehopper-line-dirs=qurator.dinglehopper.cli_line_dirs:main",
-            "dinglehopper-extract=qurator.dinglehopper.cli_extract:main",
-            "ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper",
-        ]
-    },
-)
--- a/src/dinglehopper/init.py
+++ b/src/dinglehopper/init.py
@ -0,0 +1,33 @@
+from .align import align, score_hint, seq_align
+from .character_error_rate import character_error_rate, character_error_rate_n
+from .edit_distance import distance, editops
+from .extracted_text import ExtractedText
+from .ocr_files import (
+    alto_namespace,
+    alto_text,
+    page_namespace,
+    page_text,
+    plain_text,
+    text,
+)
+from .word_error_rate import word_error_rate, word_error_rate_n, words
+
+__all__ = [
+    "editops",
+    "distance",
+    "align",
+    "score_hint",
+    "seq_align",
+    "character_error_rate",
+    "character_error_rate_n",
+    "word_error_rate",
+    "word_error_rate_n",
+    "words",
+    "ExtractedText",
+    "alto_namespace",
+    "alto_text",
+    "page_namespace",
+    "page_text",
+    "plain_text",
+    "text",
+]
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@ -1,9 +1,12 @@
 import math
+import unicodedata
 from math import ceil

-from .edit_distance import *
 from rapidfuzz.distance import Levenshtein

+from .edit_distance import grapheme_clusters
+
+
 def align(t1, t2):
    """Align text."""
    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@ -1,20 +1,22 @@
 import os
+from collections import Counter

 import click
 from jinja2 import Environment, FileSystemLoader
 from markupsafe import escape
 from ocrd_utils import initLogging
-from math import ceil

-from .character_error_rate import character_error_rate_n
-from .word_error_rate import word_error_rate_n, words_normalized
-from .align import seq_align, score_hint
-from .extracted_text import ExtractedText
-from .ocr_files import extract
-from .config import Config
+from dinglehopper.align import score_hint, seq_align
+from dinglehopper.character_error_rate import character_error_rate_n
+from dinglehopper.config import Config
+from dinglehopper.extracted_text import ExtractedText
+from dinglehopper.ocr_files import extract
+from dinglehopper.word_error_rate import word_error_rate_n, words_normalized


-def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
+def gen_diff_report(
+    gt_in, ocr_in, css_prefix, joiner, none, *, differences=False, score_hint=None
+):
    gtx = ""
    ocrx = ""

@ -31,16 +33,12 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):

        # Set Bootstrap tooltip to the segment id
        if id_:
-            html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
+            html_custom_attrs += f'data-toggle="tooltip" title="{id_}"'

        if css_classes:
-            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
-                css_classes=css_classes,
-                html_t=html_t,
-                html_custom_attrs=html_custom_attrs,
-            )
+            return f'<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'
        else:
-            return "{html_t}".format(html_t=html_t)
+            return f"{html_t}"

    if isinstance(gt_in, ExtractedText):
        if not isinstance(ocr_in, ExtractedText):
@ -53,6 +51,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):

    g_pos = 0
    o_pos = 0
+    found_differences = []
+
    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, score_hint)):
        css_classes = None
        gt_id = None
@ -65,6 +65,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
                # Deletions and inserts only produce one id + None, UI must
                # support this, i.e. display for the one id produced

+            if differences:
+                found_differences.append(f"{g} :: {o}")
+
        gtx += joiner + format_thing(g, css_classes, gt_id)
        ocrx += joiner + format_thing(o, css_classes, ocr_id)

@ -73,13 +76,18 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
        if o is not None:
            o_pos += len(o)

-    return """
+    found_differences = dict(Counter(elem for elem in found_differences))
+
+    return (
+        """
        <div class="row">
           <div class="col-md-6 gt">{}</div>
           <div class="col-md-6 ocr">{}</div>
        </div>
        """.format(
            gtx, ocrx
+        ),
+        found_differences,
    )


@ -96,11 +104,20 @@ def json_float(value):
        return str(value)


-def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
+def process(
+    gt,
+    ocr,
+    report_prefix,
+    reports_folder=".",
+    *,
+    metrics=True,
+    differences=False,
+    textequiv_level="region",
+):
    """Check OCR result against GT.

-    The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
-    Click on a wrapper.
+    The @click decorators change the signature of the decorated functions, so we keep
+    this undecorated version and use Click on a wrapper.
    """

    gt_text = extract(gt, textequiv_level=textequiv_level)
@ -109,15 +126,25 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
    ocr_words = words_normalized(ocr_text)

    cer, n_characters = character_error_rate_n(gt_text, ocr_text)
-    char_diff_report = gen_diff_report(
-        gt_text, ocr_text, css_prefix="c", joiner="", none="·",
-        score_hint=score_hint(cer, n_characters)
+    char_diff_report, diff_c = gen_diff_report(
+        gt_text,
+        ocr_text,
+        css_prefix="c",
+        joiner="",
+        none="·",
+        score_hint=score_hint(cer, n_characters),
+        differences=differences,
    )

    wer, n_words = word_error_rate_n(gt_words, ocr_words)
-    word_diff_report = gen_diff_report(
-        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯",
-        score_hint=score_hint(wer, n_words)
+    word_diff_report, diff_w = gen_diff_report(
+        gt_words,
+        ocr_words,
+        css_prefix="w",
+        joiner=" ",
+        none="⋯",
+        score_hint=score_hint(wer, n_words),
+        differences=differences,
    )

    env = Environment(
@ -129,7 +156,11 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):

    for report_suffix in (".html", ".json"):
        template_fn = "report" + report_suffix + ".j2"
-        out_fn = report_prefix + report_suffix
+
+        if not os.path.isdir(reports_folder):
+            os.mkdir(reports_folder)
+
+        out_fn = os.path.join(reports_folder, report_prefix + report_suffix)

        template = env.get_template(template_fn)
        template.stream(
@ -142,16 +173,46 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
            char_diff_report=char_diff_report,
            word_diff_report=word_diff_report,
            metrics=metrics,
+            differences=differences,
+            diff_c=diff_c,
+            diff_w=diff_w,
        ).dump(out_fn)


+def process_dir(
+    gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level
+):
+    for gt_file in os.listdir(gt):
+        gt_file_path = os.path.join(gt, gt_file)
+        ocr_file_path = os.path.join(ocr, gt_file)
+
+        if os.path.isfile(gt_file_path) and os.path.isfile(ocr_file_path):
+            process(
+                gt_file_path,
+                ocr_file_path,
+                f"{gt_file}-{report_prefix}",
+                reports_folder=reports_folder,
+                metrics=metrics,
+                differences=differences,
+                textequiv_level=textequiv_level,
+            )
+        else:
+            print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
+
+
@click.command()
@click.argument("gt", type=click.Path(exists=True))
@click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report")
+@click.argument("reports_folder", type=click.Path(), default=".")
@click.option(
    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
 )
+@click.option(
+    "--differences",
+    default=False,
+    help="Enable reporting character and word level differences",
+)
@click.option(
    "--textequiv-level",
    default="region",
@ -159,7 +220,16 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
    metavar="LEVEL",
 )
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
-def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
+def main(
+    gt,
+    ocr,
+    report_prefix,
+    reports_folder,
+    metrics,
+    differences,
+    textequiv_level,
+    progress,
+):
    """
    Compare the PAGE/ALTO/text document GT against the document OCR.

@ -171,7 +241,8 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
    that case, use --no-metrics to disable the then meaningless metrics and also
    change the color scheme from green/red to blue.

-    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
+    The comparison report will be written to $REPORTS_FOLDER/$REPORT_PREFIX.{html,json},
+    where $REPORTS_FOLDER defaults to the current working directory and
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).

@ -180,7 +251,31 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
    """
    initLogging()
    Config.progress = progress
-    process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
+    if os.path.isdir(gt):
+        if not os.path.isdir(ocr):
+            raise click.BadParameter(
+                "OCR must be a directory if GT is a directory", param_hint="ocr"
+            )
+        else:
+            process_dir(
+                gt,
+                ocr,
+                report_prefix,
+                reports_folder,
+                metrics,
+                differences,
+                textequiv_level,
+            )
+    else:
+        process(
+            gt,
+            ocr,
+            report_prefix,
+            reports_folder,
+            metrics=metrics,
+            differences=differences,
+            textequiv_level=textequiv_level,
+        )


 if __name__ == "__main__":
--- a/qurator/dinglehopper/cli_extract.py
+++ b/qurator/dinglehopper/cli_extract.py
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@ -1,15 +1,15 @@
-import os
 import itertools
+import os

 import click
 from jinja2 import Environment, FileSystemLoader
 from ocrd_utils import initLogging
-from math import ceil

+from .align import score_hint
 from .character_error_rate import character_error_rate_n
-from .word_error_rate import word_error_rate_n, words_normalized
-from .ocr_files import plain_extract
 from .cli import gen_diff_report, json_float
+from .ocr_files import plain_extract
+from .word_error_rate import word_error_rate_n, words_normalized


 def all_equal(iterable):
@ -75,12 +75,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):

        # Generate diff reports
        char_diff_report += gen_diff_report(
-            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·",
-            score_hint=score_hint(l_cer, l_n_characters)
+            gt_text,
+            ocr_text,
+            css_prefix="l{0}-c".format(k),
+            joiner="",
+            none="·",
+            score_hint=score_hint(l_cer, l_n_characters),
        )
        word_diff_report += gen_diff_report(
-            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯",
-            score_hint=score_hint(l_wer, l_n_words)
+            gt_words,
+            ocr_words,
+            css_prefix="l{0}-w".format(k),
+            joiner=" ",
+            none="⋯",
+            score_hint=score_hint(l_wer, l_n_words),
        )

    env = Environment(
--- a/src/dinglehopper/cli_summarize.py
+++ b/src/dinglehopper/cli_summarize.py
@ -0,0 +1,106 @@
+import json
+import os
+
+import click
+from jinja2 import Environment, FileSystemLoader
+from ocrd_utils import initLogging
+
+from dinglehopper.cli import json_float
+
+
+def process(reports_folder, occurrences_threshold=1):
+    cer_list = []
+    wer_list = []
+    cer_sum = 0
+    wer_sum = 0
+    diff_c = {}
+    diff_w = {}
+
+    for report in os.listdir(reports_folder):
+        if report.endswith(".json"):
+            with open(os.path.join(reports_folder, report), "r") as f:
+                report_data = json.load(f)
+
+                if "cer" not in report_data or "wer" not in report_data:
+                    click.echo(
+                        f"Skipping {report} because it does not contain CER and WER"
+                    )
+                    continue
+
+                cer = report_data["cer"]
+                wer = report_data["wer"]
+                cer_list.append(cer)
+                wer_list.append(wer)
+                cer_sum += cer
+                wer_sum += wer
+
+                try:
+                    for key, value in report_data["differences"][
+                        "character_level"
+                    ].items():
+                        diff_c[key] = diff_c.get(key, 0) + value
+                    for key, value in report_data["differences"]["word_level"].items():
+                        diff_w[key] = diff_w.get(key, 0) + value
+                except KeyError:
+                    pass
+
+    if len(cer_list) == 0:
+        click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
+        return
+
+    cer_avg = cer_sum / len(cer_list)
+    wer_avg = wer_sum / len(wer_list)
+
+    print(f"Number of reports: {len(cer_list)}")
+    print(f"Average CER: {cer_avg}")
+    print(f"Average WER: {wer_avg}")
+    print(f"Sum of common mistakes: {cer_sum}")
+    print(f"Sum of common mistakes: {wer_sum}")
+
+    env = Environment(
+        loader=FileSystemLoader(
+            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
+        )
+    )
+    env.filters["json_float"] = json_float
+    for report_suffix in (".html", ".json"):
+        template_fn = "summary" + report_suffix + ".j2"
+
+        out_fn = os.path.join(reports_folder, "summary" + report_suffix)
+        template = env.get_template(template_fn)
+        template.stream(
+            num_reports=len(cer_list),
+            cer_avg=cer_avg,
+            wer_avg=wer_avg,
+            diff_c=diff_c,
+            diff_w=diff_w,
+            occurrences_threshold=occurrences_threshold,
+        ).dump(out_fn)
+
+
+@click.command()
+@click.argument("reports_folder", type=click.Path(exists=True), default="./reports")
+@click.option(
+    "--occurrences-threshold",
+    type=int,
+    default=1,
+    help="Only show differences that occur at least this many times.",
+)
+def main(reports_folder, occurrences_threshold):
+    """
+    Summarize the results from multiple reports generated earlier by dinglehopper.
+    It calculates the average CER and WER, as well as a sum of common mistakes.
+    Reports include lists of mistakes and their occurrences.
+
+    You may use a threshold to reduce the file size of the HTML report by only showing
+    mistakes whose number of occurrences is above the threshold. The JSON report will
+    always contain all mistakes.
+
+    All JSON files in the provided folder will be gathered and summarized.
+    """
+    initLogging()
+    process(reports_folder, occurrences_threshold)
+
+
+if __name__ == "__main__":
+    main()
--- a/qurator/dinglehopper/config.py
+++ b/qurator/dinglehopper/config.py
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@ -1,8 +1,8 @@
 import unicodedata

 from multimethod import multimethod
-from uniseg.graphemecluster import grapheme_clusters
 from rapidfuzz.distance import Levenshtein
+from uniseg.graphemecluster import grapheme_clusters

 from .extracted_text import ExtractedText

--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
--- a/qurator/dinglehopper/notebooks/Levenshtein.ipynb
+++ b/qurator/dinglehopper/notebooks/Levenshtein.ipynb
--- a/qurator/dinglehopper/notebooks/Unicode
+++ b/qurator/dinglehopper/notebooks/Unicode
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@ -2,6 +2,7 @@ import os
 import sys
 from typing import Iterator

+import chardet
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
 from uniseg.graphemecluster import grapheme_clusters
@ -12,8 +13,8 @@ from .extracted_text import ExtractedText, normalize_sbb
 def alto_namespace(tree: ET.ElementTree) -> str:
    """Return the ALTO namespace used in the given ElementTree.

-    This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not
-    check if the files uses any valid ALTO namespace.
+    This relies on the assumption that, in any given ALTO file, the root element has the
+    local name "alto". We do not check if the files uses any valid ALTO namespace.
    """
    root_name = ET.QName(tree.getroot().tag)
    if root_name.localname == "alto":
@ -48,8 +49,9 @@ def alto_text(tree):
 def page_namespace(tree):
    """Return the PAGE content namespace used in the given ElementTree.

-    This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We
-    do not check if the files uses any valid PAGE namespace.
+    This relies on the assumption that, in any given PAGE content file, the root element
+    has the local name "PcGts". We do not check if the files uses any valid PAGE
+    namespace.
    """
    root_name = ET.QName(tree.getroot().tag)
    if root_name.localname == "PcGts":
@ -135,6 +137,10 @@ def page_text(tree, *, textequiv_level="region"):
    return page_extract(tree, textequiv_level=textequiv_level).text


+def detect_encoding(filename):
+    return chardet.detect(open(filename, "rb").read(1024))["encoding"]
+
+
 def plain_extract(filename, include_filename_in_id=False):
    id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"

@ -149,7 +155,8 @@ def plain_extract(filename, include_filename_in_id=False):
            clusters,
        )

-    with open(filename, "r") as f:
+    fileencoding = detect_encoding(filename)
+    with open(filename, "r", encoding=fileencoding) as f:
        return ExtractedText(
            None,
            [make_segment(no, line) for no, line in enumerate(f.readlines())],
@ -171,7 +178,7 @@ def extract(filename, *, textequiv_level="region"):
    """
    try:
        tree = ET.parse(filename)
-    except XMLSyntaxError:
+    except (XMLSyntaxError, UnicodeDecodeError):
        return plain_extract(filename)
    try:
        return page_extract(tree, textequiv_level=textequiv_level)
--- a/qurator/dinglehopper/ocrd-tool.json
+++ b/qurator/dinglehopper/ocrd-tool.json
@ -1,4 +1,5 @@
 {
+  "version": "0.9.4",
  "git_url": "https://github.com/qurator-spk/dinglehopper",
  "tools": {
    "ocrd-dinglehopper": {
--- a/qurator/dinglehopper/ocrd_cli.py
+++ b/qurator/dinglehopper/ocrd_cli.py
@ -4,7 +4,7 @@ import os
 import click
 from ocrd import Processor
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
-from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
+from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
 from pkg_resources import resource_string

 from .cli import process as cli_process
--- a/qurator/dinglehopper/templates/report.html.j2
+++ b/qurator/dinglehopper/templates/report.html.j2
@ -26,6 +26,22 @@
      border: 2px solid;
      border-radius: 5px;
    }
+
+    .row {
+        margin-bottom: 20px;
+    }
+
+    table {
+        width: 100%;
+    }
+
+    th {
+        cursor: pointer;
+    }
+
+    th:hover {
+        background-color: #eee;
+    }
    </style>
 </head>
 <body>
@ -50,6 +66,32 @@
 <h2>Word differences</h2>
 {{ word_diff_report }}

+{%- if differences %}
+{% set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
+
+<div class="row">
+{% for section in sections %}
+    <div class="col-md-6">
+        <h2>{{ section['title'] }}</h2>
+        <table>
+            <thead>
+            <tr>
+                <th>GT</th>
+                <th>OCR</th>
+                <th>Occurrences</th>
+            </tr>
+            </thead>
+            {# The data rows need their own tbody: without the closing thead tag they would end up in the table head, and the column sorting only looks at "tbody > tr". #}
+            <tbody>
+            {% for gt_ocr, occurrences in section['data'].items() %}
+                {# Keys have the form "GT :: OCR", so split on the separator including its spaces #}
+                <tr>
+                    <td>{{ gt_ocr.split(" :: ")[0] }}</td>
+                    <td>{{ gt_ocr.split(" :: ")[1] }}</td>
+                    <td>{{ occurrences }}</td>
+                </tr>
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+{% endfor %}
+</div>
+{%- endif %}

 </div>

--- a/src/dinglehopper/templates/report.html.js
+++ b/src/dinglehopper/templates/report.html.js
@ -0,0 +1,39 @@
+function find_diff_class(classes) {
+    return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
+}
+
+/* Set up tooltips, diff highlighting and sortable table columns once the document is ready. */
+$(document).ready(function() {
+    /* Enable Bootstrap tooltips */
+    $('[data-toggle="tooltip"]').tooltip();
+
+    /* While hovering a diff element, highlight every element found for its diff class */
+    $('.diff').mouseover(function() {
+        find_diff_class($(this).attr('class')).addClass('diff-highlight');
+    });
+    $('.diff').mouseout(function() {
+        find_diff_class($(this).attr('class')).removeClass('diff-highlight');
+    });
+
+    /* Sort this column of the table */
+    $('th').click(function () {
+        var table = $(this).closest('table');
+        /* Only rows that are direct children of a tbody are sorted */
+        var rows = table.find('tbody > tr').toArray().sort(compareRows($(this).index()));
+        /* The sort direction is remembered on the clicked th element and flipped on every click */
+        this.asc = !this.asc;
+        if (!this.asc) {
+            rows = rows.reverse();
+        }
+        /* Appending a row that is already in the tbody moves it, so this puts the rows in sorted order */
+        for (var i = 0; i < rows.length; i++) {
+            table.children('tbody').append(rows[i]);
+        }
+    });
+
+    /* Return a comparator for two rows that compares the lowercased text of the td cell at the given index */
+    function compareRows(index) {
+        return function (row1, row2) {
+            var cell1 = $(row1).children('td').eq(index).text().toLowerCase();
+            var cell2 = $(row2).children('td').eq(index).text().toLowerCase();
+            /* numeric: true compares digit sequences by value, so "10" sorts after "9" */
+            return cell1.localeCompare(cell2, undefined, {
+                numeric: true,
+                sensitivity: 'base'
+            });
+        }
+    }
+});
--- a/qurator/dinglehopper/templates/report.json.j2
+++ b/qurator/dinglehopper/templates/report.json.j2
@ -4,6 +4,12 @@
 {% if metrics %}
    "cer": {{ cer|json_float }},
    "wer": {{ wer|json_float }},
+{% endif %}
+{% if differences %}
+    "differences": {
+        "character_level": {{ diff_c|tojson }},
+        "word_level": {{ diff_w|tojson }}
+    },
 {% endif %}
    "n_characters": {{ n_characters }},
    "n_words": {{ n_words }}
--- a/src/dinglehopper/templates/summary.html.j2
+++ b/src/dinglehopper/templates/summary.html.j2
@ -0,0 +1,136 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+
+    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
+    <style type="text/css">
+    {% if metrics %}
+    .gt .diff {
+        color: green;
+    }
+    .ocr .diff {
+        color: red;
+    }
+    {% else %}
+    .gt .diff, .ocr .diff {
+        color: blue;
+    }
+    {% endif %}
+    .ellipsis {
+        opacity: 0.5;
+        font-style: italic;
+    }
+    .diff-highlight {
+      border: 2px solid;
+      border-radius: 5px;
+    }
+
+    .row {
+        margin-bottom: 20px;
+    }
+
+    table {
+        width: 100%;
+    }
+
+    .cer {
+        flex-direction: column;
+    }
+
+    tr:hover {
+        background-color: #f5f5f5;
+    }
+
+    th {
+        cursor: pointer;
+    }
+
+    th:hover {
+        background-color: #eee;
+    }
+
+    td {
+        min-width: 100px;
+    }
+
+    td:hover {
+        background-color: #eee;
+    }
+    </style>
+</head>
+<body>
+
+<div class="container">
+
+<div class="row">
+    <h1>Summary of all reports</h1>
+</div>
+
+<div class="row">
+    <p>Number of reports: {{ num_reports }}</p>
+</div>
+
+{# Test for numbers instead of truthiness, so that a perfect average of 0 is still reported #}
+{% if cer_avg is number and wer_avg is number -%}
+<div class="row">
+    <h2>Metrics</h2>
+</div>
+
+<div class="row cer">
+    <p>Average CER: {{ cer_avg|round(4) }}</p>
+    <p>Average WER: {{ wer_avg|round(4) }}</p>
+</div>
+{% endif %}
+
+{%- if diff_c and diff_w %}
+{%- set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
+
+<div class="row">
+{%- for section in sections %}
+    <div class="col-md-6">
+        <h2>{{ section['title'] }}</h2>
+        {#- Counts the rows left out of this section's table; a namespace is needed to update it from within the inner loop -#}
+        {%- set num_omitted = namespace(value=0) %}
+        <table>
+            <thead>
+            <tr><th>GT</th><th>OCR</th><th>Occurrences</th></tr>
+            </thead>
+            <tbody>
+            {% for gt_ocr, occurrences in section['data'].items() -%}
+                {% if occurrences < occurrences_threshold -%}
+                    {%- set num_omitted.value = num_omitted.value + 1 %}
+                {%- else -%}
+                    {%- set gt = gt_ocr.split(" :: ")[0] %}
+                    {%- set ocr = gt_ocr.split(" :: ")[1] %}
+                    <tr>
+                        <td title="{{ gt|urlencode }}">{{ gt }}</td>{# display the unicode character #}
+                        <td title="{{ ocr|urlencode }}">{{ ocr }}</td>
+                        <td>{{ occurrences }}</td>
+                    </tr>
+                {%- endif %}
+            {%- endfor %}
+            </tbody>
+        </table>
+
+        {# A paragraph is not allowed inside a table, so the note about omitted rows follows the table #}
+        {% if num_omitted.value > 0  and occurrences_threshold > 1 -%}
+            <p>Skipped {{ num_omitted.value }} diffs with fewer than {{ occurrences_threshold }} occurrences. The complete list of diffs is available in the accompanying JSON file.</p>
+        {%- endif %}
+    </div>
+{%- endfor %}
+</div>
+{%- endif %}
+
+</div>
+
+
+
+<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
+<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
+
+<script>
+{% include 'report.html.js' %}
+</script>
+
+
+</body>
+</html>
--- a/src/dinglehopper/templates/summary.json.j2
+++ b/src/dinglehopper/templates/summary.json.j2
@ -0,0 +1,15 @@
+{
+"num_reports": {{ num_reports }}
+{#- Test for numbers instead of truthiness, so that a perfect average of 0 is still reported #}
+{%- if cer_avg is number and wer_avg is number %}
+    ,
+    "cer_avg": {{ cer_avg|json_float }},
+    "wer_avg": {{ wer_avg|json_float }}
+{%- endif %}
+{#- Emit the differences whenever there are any, independent of the metrics #}
+{%- if diff_c or diff_w %}
+    ,
+    "differences": {
+        "character_level": {{ diff_c|tojson }},
+        "word_level": {{ diff_w|tojson }}
+    }
+{%- endif %}
+}
--- a/qurator/dinglehopper/tests/init.py
+++ b/qurator/dinglehopper/tests/init.py
--- a/qurator/dinglehopper/tests/data/00000119.tif
+++ b/qurator/dinglehopper/tests/data/00000119.tif
--- a/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-GT-PAGE/00000024.page.xml
+++ b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-GT-PAGE/00000024.page.xml
--- a/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml
+++ b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml
--- a/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml
+++ b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml
--- a/qurator/dinglehopper/tests/data/actevedef_718448162/mets.xml
+++ b/qurator/dinglehopper/tests/data/actevedef_718448162/mets.xml
--- a/src/dinglehopper/tests/data/bigger-texts/00008228/00008228-00236534.gt4hist.xml
+++ b/src/dinglehopper/tests/data/bigger-texts/00008228/00008228-00236534.gt4hist.xml
--- a/src/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
+++ b/src/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
--- a/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml
+++ b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml
--- a/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml
+++ b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml
--- a/qurator/dinglehopper/tests/data/test-gt.page2018.xml
+++ b/qurator/dinglehopper/tests/data/test-gt.page2018.xml
--- a/src/dinglehopper/tests/data/directory-test/gt/2.xml
+++ b/src/dinglehopper/tests/data/directory-test/gt/2.xml
--- a/qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml
+++ b/qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml
--- a/src/dinglehopper/tests/data/directory-test/ocr/2.xml
+++ b/src/dinglehopper/tests/data/directory-test/ocr/2.xml
--- a/src/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
+++ b/src/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
--- a/qurator/dinglehopper/tests/data/levels-are-different.page.xml
+++ b/qurator/dinglehopper/tests/data/levels-are-different.page.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.gt.page.xml
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.gt.page.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt
--- a/qurator/dinglehopper/tests/data/mixed-regions.page.xml
+++ b/qurator/dinglehopper/tests/data/mixed-regions.page.xml
--- a/qurator/dinglehopper/tests/data/order.page.xml
+++ b/qurator/dinglehopper/tests/data/order.page.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-no-reading-order.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-no-reading-order.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-order-0001.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-order-0001.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-order-0002.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-order-0002.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-region.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-region.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-unordered.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-unordered.xml
--- a/src/dinglehopper/tests/data/test-fake-ocr.page2018.xml
+++ b/src/dinglehopper/tests/data/test-fake-ocr.page2018.xml
--- a/src/dinglehopper/tests/data/test-gt.page2018.xml
+++ b/src/dinglehopper/tests/data/test-gt.page2018.xml
--- a/qurator/dinglehopper/tests/data/test.alto1.xml
+++ b/qurator/dinglehopper/tests/data/test.alto1.xml
--- a/qurator/dinglehopper/tests/data/test.alto2.xml
+++ b/qurator/dinglehopper/tests/data/test.alto2.xml
--- a/qurator/dinglehopper/tests/data/test.alto3.xml
+++ b/qurator/dinglehopper/tests/data/test.alto3.xml
--- a/src/dinglehopper/tests/data/test.page2018.xml
+++ b/src/dinglehopper/tests/data/test.page2018.xml
--- a/qurator/dinglehopper/tests/data/test.txt
+++ b/qurator/dinglehopper/tests/data/test.txt
--- a/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/462875_0008.jpg
+++ b/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/462875_0008.jpg
--- a/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/OCR-D-GT_0008.xml
+++ b/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/OCR-D-GT_0008.xml
--- a/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/OCR-D-OCR-TESS_0008.xml
+++ b/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/OCR-D-OCR-TESS_0008.xml
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@ -6,7 +6,7 @@ import pytest
 from lxml import etree as ET
 from uniseg.graphemecluster import grapheme_clusters

-from .. import seq_align, ExtractedText
+from .. import ExtractedText, seq_align


 def test_text():
@ -30,12 +30,20 @@ def test_text():

 def test_normalization_check():
    with pytest.raises(ValueError, match=r".*is not in NFC.*"):
-        ExtractedText("foo", None, None,
+        ExtractedText(
+            "foo",
+            None,
+            None,
            unicodedata.normalize("NFD", "Schlyñ"),
-                      grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")))
-    assert ExtractedText("foo", None, None,
+            grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")),
+        )
+    assert ExtractedText(
+        "foo",
+        None,
+        None,
        unicodedata.normalize("NFC", "Schlyñ"),
-                         grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")))
+        grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")),
+    )


 AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
--- a/qurator/dinglehopper/tests/test_align.py
+++ b/qurator/dinglehopper/tests/test_align.py
@ -1,7 +1,9 @@
 import math
+
 import pytest
+
+from .. import align, distance, score_hint, seq_align
 from .util import unzip
-from .. import align, seq_align, distance, score_hint


 def test_left_empty():
@ -72,7 +74,8 @@ def test_with_some_fake_ocr_errors():
    result = list(
        align(
            "Über die vielen Sorgen wegen desselben vergaß",
-            "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
+            "SomeJunk MoreJunk "
+            + "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
        )
    )
    left, right = unzip(result)
@ -183,6 +186,7 @@ def test_lines_similar():
    # Test __eq__ (i.e. is it a substitution or a similar string?)
    assert list(left)[0] == list(right)[0]

+
 def test_score_hint():
    assert score_hint(0.5, 23) == 12  # int(ceil())
    assert score_hint(math.inf, 12345) is None
--- a/qurator/dinglehopper/tests/test_character_error_rate.py
+++ b/qurator/dinglehopper/tests/test_character_error_rate.py
@ -36,6 +36,7 @@ def test_character_error_rate_hard():
        len(s2) == 7
    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points

-    # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
+    # Both strings have the same length in terms of grapheme clusters. So the CER should
+    # be symmetrical.
    assert character_error_rate(s2, s1) == 1 / 6
    assert character_error_rate(s1, s2) == 1 / 6
--- a/qurator/dinglehopper/tests/test_edit_distance.py
+++ b/qurator/dinglehopper/tests/test_edit_distance.py
--- a/qurator/dinglehopper/tests/test_editops.py
+++ b/qurator/dinglehopper/tests/test_editops.py
--- a/qurator/dinglehopper/tests/test_integ_align.py
+++ b/qurator/dinglehopper/tests/test_integ_align.py
@ -15,7 +15,9 @@ def test_align_page_files():
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
    # → 2 elements in the alignment should be different, the ligature is
    # (currently) not counted due to normalization.
-    # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
+    #
+    # NOTE: In this example, it doesn't matter that we work with "characters", not
+    # grapheme clusters.

    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
    ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
--- a/src/dinglehopper/tests/test_integ_bigger_texts.py
+++ b/src/dinglehopper/tests/test_integ_bigger_texts.py
@ -0,0 +1,28 @@
+from __future__ import division, print_function
+
+import os
+
+import pytest
+from lxml import etree as ET
+
+from .. import alto_text, character_error_rate, page_text
+
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+@pytest.mark.integration
+def test_bigger_texts():
+    gt = page_text(
+        ET.parse(os.path.join(data_dir, "bigger-texts", "00008228", "00008228.gt.xml"))
+    )
+    ocr = alto_text(
+        ET.parse(
+            os.path.join(
+                data_dir, "bigger-texts", "00008228", "00008228-00236534.gt4hist.xml"
+            )
+        )
+    )
+
+    # Only interested in a result here: In earlier versions this would have used
+    # tens of GB of RAM and should now not break a sweat.
+    assert character_error_rate(gt, ocr) >= 0.0
--- a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
@ -6,7 +6,7 @@ import pytest
 from lxml import etree as ET
 from uniseg.graphemecluster import grapheme_clusters

-from .. import character_error_rate, page_text, alto_text
+from .. import alto_text, character_error_rate, page_text

 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")

--- a/src/dinglehopper/tests/test_integ_cli_dir.py
+++ b/src/dinglehopper/tests/test_integ_cli_dir.py
@ -0,0 +1,53 @@
+import os
+
+import pytest
+from ocrd_utils import initLogging
+
+from dinglehopper.cli import process_dir
+
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+@pytest.mark.integration
+def test_cli_directory(tmp_path):
+    """
+    Test that the cli/process_dir() processes a directory of files and
+    yields JSON and HTML reports.
+    """
+
+    initLogging()
+    process_dir(
+        os.path.join(data_dir, "directory-test", "gt"),
+        os.path.join(data_dir, "directory-test", "ocr"),
+        "report",
+        str(tmp_path / "reports"),
+        False,
+        True,
+        "line",
+    )
+
+    assert os.path.exists(tmp_path / "reports/1.xml-report.json")
+    assert os.path.exists(tmp_path / "reports/1.xml-report.html")
+    assert os.path.exists(tmp_path / "reports/2.xml-report.json")
+    assert os.path.exists(tmp_path / "reports/2.xml-report.html")
+
+
+@pytest.mark.integration
+def test_cli_fail_without_gt(tmp_path):
+    """
+    Test that the cli/process_dir skips a file if there is no corresponding file
+    in the other directory.
+    """
+
+    initLogging()
+    process_dir(
+        os.path.join(data_dir, "directory-test", "gt"),
+        os.path.join(data_dir, "directory-test", "ocr"),
+        "report",
+        str(tmp_path / "reports"),
+        False,
+        True,
+        "line",
+    )
+
+    assert len(os.listdir(tmp_path / "reports")) == 2 * 2
--- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
+++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
@ -1,9 +1,9 @@
 import json

 import pytest
-from .util import working_directory

 from ..cli import process
+from .util import working_directory


@pytest.mark.integration
--- a/src/dinglehopper/tests/test_integ_differences.py
+++ b/src/dinglehopper/tests/test_integ_differences.py
@ -0,0 +1,37 @@
+import json
+import os
+
+import pytest
+from ocrd_utils import initLogging
+
+from dinglehopper.cli import process
+
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+@pytest.mark.integration
+def test_cli_differences(tmp_path):
+    """Test that the cli/process() yields a JSON report that includes
+    the differences found between the GT and OCR text"""
+
+    initLogging()
+    process(
+        os.path.join(data_dir, "test-gt.page2018.xml"),
+        os.path.join(data_dir, "test-fake-ocr.page2018.xml"),
+        "report",
+        tmp_path,
+        differences=True,
+    )
+
+    assert os.path.exists(tmp_path / "report.json")
+
+    with open(tmp_path / "report.json", "r") as jsonf:
+        j = json.load(jsonf)
+
+        assert j["differences"] == {
+            "character_level": {"n :: m": 1, "ſ :: f": 1},
+            "word_level": {
+                "Augenblick :: Augemblick": 1,
+                "Verſprochene :: Verfprochene": 1,
+            },
+        }
--- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
@ -5,7 +5,7 @@ import os
 import pytest
 from lxml import etree as ET

-from .. import distance, page_text, alto_text
+from .. import alto_text, distance, page_text

 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")

--- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
+++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
@ -1,21 +1,20 @@
+import json
 import os
 import shutil
-import json
 import sys
 from pathlib import Path

 import pytest
 from click.testing import CliRunner
-from .util import working_directory
-

 from ..ocrd_cli import ocrd_dinglehopper
+from .util import working_directory

 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
-@pytest.mark.skipif(sys.platform == 'win32', reason="only on unix")
+@pytest.mark.skipif(sys.platform == "win32", reason="only on unix")
 def test_ocrd_cli(tmp_path):
    """Test OCR-D interface"""

--- a/src/dinglehopper/tests/test_integ_summarize.py
+++ b/src/dinglehopper/tests/test_integ_summarize.py
@ -0,0 +1,110 @@
+import json
+import os
+
+import pytest
+
+from .. import cli_summarize
+from .util import working_directory
+
+# Values the tests below expect in the summary of the two mock reports written
+# by the create_summaries fixture: the CER/WER averages and, for the difference
+# counts, the per-key sums (e.g. "a": 10 + 20 = 30).
+expected_cer_avg = (0.05 + 0.10) / 2
+expected_wer_avg = (0.15 + 0.20) / 2
+expected_diff_c = {"a": 30, "b": 50}
+expected_diff_w = {"c": 70, "d": 90}
+
+
+@pytest.fixture
+def create_summaries(tmp_path):
+    """Write two mock JSON reports into ``tmp_path / "reports"``.
+
+    Returns the path of that reports directory as a string.
+    """
+    mock_reports = {
+        "report1.json": {
+            "cer": 0.05,
+            "wer": 0.15,
+            "differences": {
+                "character_level": {"a": 10, "b": 20},
+                "word_level": {"c": 30, "d": 40},
+            },
+        },
+        "report2.json": {
+            "cer": 0.10,
+            "wer": 0.20,
+            "differences": {
+                "character_level": {"a": 20, "b": 30},
+                "word_level": {"c": 40, "d": 50},
+            },
+        },
+    }
+
+    target_dir = tmp_path / "reports"
+    target_dir.mkdir()
+
+    for filename, report in mock_reports.items():
+        with open(os.path.join(target_dir, filename), "w") as out:
+            json.dump(report, out)
+
+    return str(target_dir)
+
+
+@pytest.mark.integration
+def test_cli_summarize_json(tmp_path, create_summaries):
+    """Test that the cli/process() yields a summarized JSON report.
+
+    The ``summary.json`` written into the reports directory must contain the
+    report count, the CER/WER averages and the per-key sums of the difference
+    counts of the two mock reports.
+    """
+    with working_directory(tmp_path):
+        reports_dirname = create_summaries
+        cli_summarize.process(reports_dirname)
+
+        with open(os.path.join(reports_dirname, "summary.json"), "r") as f:
+            summary_data = json.load(f)
+
+        assert summary_data["num_reports"] == 2
+        # Averages are floats: compare with a tolerance so the test does not
+        # depend on the exact order/rounding of the implementation's arithmetic.
+        assert summary_data["cer_avg"] == pytest.approx(expected_cer_avg)
+        assert summary_data["wer_avg"] == pytest.approx(expected_wer_avg)
+        assert summary_data["differences"]["character_level"] == expected_diff_c
+        assert summary_data["differences"]["word_level"] == expected_diff_w
+
+
+@pytest.mark.integration
+def test_cli_summarize_html(tmp_path, create_summaries):
+    """Summarizing the two mock reports must produce a non-empty ``summary.html``
+    that shows the report count and the rounded CER/WER averages"""
+    expected_snippets = (
+        "Number of reports: 2",
+        f"Average CER: {round(expected_cer_avg, 4)}",
+        f"Average WER: {round(expected_wer_avg, 4)}",
+    )
+
+    with working_directory(tmp_path):
+        reports_dirname = create_summaries
+        cli_summarize.process(reports_dirname)
+
+        summary_html = os.path.join(reports_dirname, "summary.html")
+        assert os.path.isfile(summary_html)
+
+        with open(summary_html, "r") as html:
+            contents = html.read()
+
+        assert len(contents) > 0
+        for snippet in expected_snippets:
+            assert snippet in contents
+
+
+@pytest.mark.integration
+def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries):
+    """
+    A report without a "wer" entry must not be counted in the HTML summary.
+    """
+    # Same shape as the fixture's reports, except that "wer" is absent.
+    incomplete_report = {
+        "cer": 0.10,
+        "differences": {
+            "character_level": {"a": 20, "b": 30},
+            "word_level": {"c": 40, "d": 50},
+        },
+    }
+
+    with working_directory(tmp_path):
+        reports_dirname = create_summaries
+        incomplete_path = os.path.join(reports_dirname, "report3-missing-wer.json")
+
+        with open(incomplete_path, "w") as out:
+            json.dump(incomplete_report, out)
+
+        cli_summarize.process(reports_dirname)
+
+        summary_html = os.path.join(reports_dirname, "summary.html")
+        assert os.path.isfile(summary_html)
+
+        with open(summary_html, "r") as html:
+            contents = html.read()
+
+        # Only the two complete fixture reports are expected in the count.
+        assert "Number of reports: 2" in contents
--- a/qurator/dinglehopper/tests/test_integ_table_extraction.py
+++ b/qurator/dinglehopper/tests/test_integ_table_extraction.py
--- a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
@ -5,15 +5,15 @@ import os
 import pytest
 from lxml import etree as ET

-from .. import word_error_rate, words, page_text, alto_text
+from .. import alto_text, page_text, word_error_rate, words

 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
 def test_word_error_rate_between_page_files():
-    # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi. So we have 3 changed words,
-    # the ligature does not count → 2 errors
+    # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
+    # So we have 3 changed words, the ligature does not count → 2 errors
    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))

    gt_word_count = (
--- a/qurator/dinglehopper/tests/test_ocr_files.py
+++ b/qurator/dinglehopper/tests/test_ocr_files.py
@ -1,13 +1,11 @@
 import os
 import re
-
-import lxml.etree as ET
 import textwrap

-import pytest
+import lxml.etree as ET

-from .util import working_directory
 from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
+from .util import working_directory

 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")

@ -161,7 +159,8 @@ def test_page_level():
    result = page_text(tree, textequiv_level="line")
    assert (
        result
-        == "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
+        == "Hand, Mylord? fragte der Graf von Rocheſter.\n"
+        + "Als er einsmals in dem Oberhauſe eine Bill we-"
    )


--- a/qurator/dinglehopper/tests/test_word_error_rate.py
+++ b/qurator/dinglehopper/tests/test_word_error_rate.py
@ -27,7 +27,8 @@ def test_words():
 def test_words_private_use_area():
    result = list(
        words(
-            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
+            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, "
+            "der Frau Amtmnnin das ver⸗\n"
            "ſproene zu berliefern."
        )
    )
--- a/qurator/dinglehopper/tests/util.py
+++ b/qurator/dinglehopper/tests/util.py
@ -1,8 +1,8 @@
+import os
 from itertools import zip_longest
 from typing import Iterable

 import colorama
-import os


 def diffprint(x, y):
--- a/Show more
+++ b/Show more
				`@ -1 +0,0 @@`
				`__import__("pkg_resources").declare_namespace(__name__)`