Merge branch 'master' into performance

2025-10-15 10:39:54 +02:00 · 2024-01-02 20:22:38 +01:00 · 2024-01-02 20:22:38 +01:00 · 38fcbc8e1c
commit 38fcbc8e1c
parent 68a12f8f7f f077ce2e1b
101 changed files with 58154 additions and 199 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -1,23 +0,0 @@
-version: 2.1
-
-jobs:
-  test:
-    parameters:
-      python-version:
-        type: string
-    docker:
-      - image: cimg/python:<< parameters.python-version >>
-    steps:
-      - checkout
-      - run: pip3 install --upgrade pip
-      - run: pip3 install -r requirements.txt
-      - run: pip3 install pytest
-      - run: pytest
-
-workflows:
-  all-tests:
-    jobs:
-      - test:
-          matrix:
-            parameters:
-              python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
--- a/.editorconfig
+++ b/.editorconfig
@ -15,7 +15,7 @@ indent_size = 2

 [*.json]
 indent_size = 2
-insert_final_newline = false
+insert_final_newline = true

 # trailing spaces in markdown indicate word wrap
 [*.md]
--- a/.github/workflows/release-check-version-tag
+++ b/.github/workflows/release-check-version-tag
@ -0,0 +1,14 @@
+#!/bin/bash
+
+# We call setuptools.setup() here as we may rely on setuptools to interpret
+# a dynamic version field. (Reading pyproject.toml is not enough in that case.)
+expected_git_tag="v$(python -c 'from setuptools import setup; setup()' --version)"
+actual_git_tag="$(git describe --tags)"
+
+if [[ "$expected_git_tag" == "$actual_git_tag" ]]; then
+  echo "OK: Python package version $expected_git_tag matches git tag"
+  exit 0
+else
+  echo "ERROR: Python package version $expected_git_tag does NOT match git tag $actual_git_tag"
+  exit 1
+fi
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -0,0 +1,69 @@
+name: release
+
+on:
+  push:
+    tags:
+      - "v*.*.*"
+
+env:
+  PYPI_URL: https://pypi.org/p/dinglehopper
+
+jobs:
+  test:
+    uses: ./.github/workflows/test.yml
+
+  build:
+    needs: test
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Upgrade pip
+        run: python3 -m pip install --upgrade pip
+      - name: Install setuptools
+        run: |
+          python3 -m pip install --upgrade setuptools
+          # For OCR-D tools, we need setuptools-ocrd to get the version
+          if [ -e ocrd-tool.json ]; then
+            python3 -m pip install setuptools-ocrd
+          fi
+      - name: Check git tag vs package version
+        run: .github/workflows/release-check-version-tag
+      - name: Build package
+        run: python3 -m pip install --upgrade build && python3 -m build
+      - name: Upload dist
+        uses: actions/upload-artifact@v3
+        with:
+          name: dist
+          path: dist/
+
+  github-release:
+    needs: build
+    runs-on: ubuntu-latest
+    steps:
+      - name: Download dist
+        uses: actions/download-artifact@v3
+        with:
+          name: dist
+          path: dist/
+      - name: Create release on GitHub
+        uses: softprops/action-gh-release@v1
+        with:
+          files: dist/*
+
+  pypi-publish:
+    needs: build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: ${{ env.PYPI_URL }}
+    permissions:
+      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
+    steps:
+      - name: Download dist
+        uses: actions/download-artifact@v3
+        with:
+          name: dist
+          path: dist/
+      - name: Publish package distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -0,0 +1,76 @@
+name: test
+
+on:
+
+  push:
+    branches:
+      - master
+
+  pull_request:
+    branches:
+      - master
+
+  schedule:
+    - cron: "00 16 07 * *"  # = monthly
+
+  # Allow manually running (from GitHub Web)
+  workflow_dispatch:
+
+  # Allow calling this workflow (e.g. from release workflow)
+  workflow_call:
+
+jobs:
+  test:
+
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12" ]
+
+    # For Python 3.6, we need to fall back to Ubuntu 20.04
+    runs-on: ${{ matrix.python-version == '3.6' && 'ubuntu-20.04' || 'ubuntu-latest' }}
+
+    env:
+      test_results_dir: test-results-${{ matrix.python-version }}
+
+    steps:
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Update pip
+        run: python3 -m pip install -U pip
+      - name: Avoid compiling OpenCV and NumPy on Python 3.6
+        run: |
+          if python3 --version | grep -q "Python 3.6"; then
+             pip install --prefer-binary -U opencv-python-headless numpy
+          fi
+      - name: Install requirements*.txt
+        run: |
+          for requirements_txt in requirements*.txt; do
+            python3 -m pip install -r $requirements_txt;
+          done
+
+      - name: Test
+        run: |
+            cd src
+            mkdir -p ../$test_results_dir
+            python3 -m pytest --junitxml=../$test_results_dir/junit.xml -o junit_family=legacy
+      - name: Upload test results
+        uses: actions/upload-artifact@v3
+        if: success() || failure()
+        with:
+          name: ${{ env.test_results_dir }}
+          path: ${{ env.test_results_dir }}
+
+      - name: Report tests
+        uses: dorny/test-reporter@v1
+        if: success() || failure()
+        with:
+          name: Results on Python ${{ matrix.python-version }}
+          path: "${{env.test_results_dir }}/junit.xml"
+          reporter: java-junit
--- a/.gitignore
+++ b/.gitignore
@ -16,6 +16,7 @@ htmlcov/
 .venv
 env/
 venv/
+.python-version

 # mypy
 .mypy_cache/
@ -27,3 +28,4 @@ dmypy.json

 # Build artifacts
 /build
+/dist
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,36 @@
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+    -   id: check-json
+    -   id: check-toml
+    -   id: check-yaml
+    -   id: check-added-large-files
+    -   id: check-ast
+
+-   repo: https://github.com/psf/black
+    rev: 23.10.0
+    hooks:
+    -   id: black
+
+-   repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.1
+    hooks:
+    -   args:
+        - --fix
+        - --exit-non-zero-on-fix
+        id: ruff
+
+-   repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.6.1
+    hooks:
+    -   additional_dependencies:
+        - types-setuptools
+        id: mypy
+
+-   repo: https://gitlab.com/vojko.pribudic/pre-commit-update
+    rev: v0.1.0
+    hooks:
+    -   id: pre-commit-update
--- a/README-DEV.md
+++ b/README-DEV.md
@ -1,6 +1,6 @@
 Testing
 =======
-Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests):
+Use `pytest` to run the tests in [the tests directory](src/dinglehopper/tests):
 ```bash
 virtualenv -p /usr/bin/python3 venv
 . venv/bin/activate
@ -10,6 +10,7 @@ pytest
 ```

 ## Test running examples
+
 Only unit tests:
 ```bash
 pytest -m "not integration"
@ -27,11 +28,18 @@ pytest

 All tests with code coverage:
 ```bash
-pytest --cov=qurator --cov-report=html
+pytest --cov=dinglehopper --cov-report=html
 ```

 Static code analysis:
 ```bash
-pytest -k "not test" --flake8
 pytest -k "not test" --mypy
+pytest -k "not test" --ruff
 ```
+
+# How to use pre-commit
+
+This project optionally uses [pre-commit](https://pre-commit.com) to check commits. To use it:
+
+- Install pre-commit, e.g. `pip install -r requirements-dev.txt`
+- Install the repo-local git hooks: `pre-commit install`
--- a/README.md
+++ b/README.md
@ -5,9 +5,13 @@ dinglehopper is an OCR evaluation tool and reads
 [ALTO](https://github.com/altoxml),
 [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files.  It
 compares a ground truth (GT) document page with a OCR result page to compute
-metrics and a word/character differences report.
+metrics and a word/character differences report. It also supports batch processing by
+generating, aggregating and summarizing multiple reports.

-[![Build Status](https://circleci.com/gh/qurator-spk/dinglehopper.svg?style=svg)](https://circleci.com/gh/qurator-spk/dinglehopper)
+[![Tests](https://github.com/qurator-spk/dinglehopper/workflows/test/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
+[![GitHub tag](https://img.shields.io/github/tag/qurator-spk/dinglehopper?include_prereleases=&sort=semver&color=blue)](https://github.com/qurator-spk/dinglehopper/releases/)
+[![License](https://img.shields.io/badge/License-Apache-blue)](#license)
+[![issues - dinglehopper](https://img.shields.io/github/issues/qurator-spk/dinglehopper)](https://github.com/qurator-spk/dinglehopper/issues)

 Goals
 -----
@ -19,15 +23,16 @@ Goals

 Installation
 ------------
-It's best to use pip, e.g.:
-~~~
-sudo pip install .
-~~~
+
+It's best to use pip to install the package from PyPI, e.g.:
+```
+pip install dinglehopper
+```

 Usage
 -----
 ~~~
-Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
+Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] [REPORTS_FOLDER]

  Compare the PAGE/ALTO/text document GT against the document OCR.

@ -35,19 +40,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
  their text and falls back to plain text if no ALTO or PAGE is detected.

  The files GT and OCR are usually a ground truth document and the result of
-  an OCR software, but you may use dinglehopper to compare two OCR results.
-  In that case, use --no-metrics to disable the then meaningless metrics and
-  also change the color scheme from green/red to blue.
+  an OCR software, but you may use dinglehopper to compare two OCR results. In
+  that case, use --no-metrics to disable the then meaningless metrics and also
+  change the color scheme from green/red to blue.

-  The comparison report will be written to $REPORT_PREFIX.{html,json}, where
-  $REPORT_PREFIX defaults to "report". The reports include the character
-  error rate (CER) and the word error rate (WER).
+  The comparison report will be written to
+  $REPORTS_FOLDER/$REPORT_PREFIX.{html,json}, where $REPORTS_FOLDER defaults
+  to the current working directory and $REPORT_PREFIX defaults to "report".
+  The reports include the character error rate (CER) and the word error rate
+  (WER).

  By default, the text of PAGE files is extracted on 'region' level. You may
  use "--textequiv-level line" to extract from the level of TextLine tags.

 Options:
  --metrics / --no-metrics  Enable/disable metrics and green/red
+  --differences BOOLEAN     Enable reporting character and word level
+                            differences
  --textequiv-level LEVEL   PAGE TextEquiv level to extract text from
  --progress                Show progress bar
  --help                    Show this message and exit.
@ -61,6 +70,43 @@ This generates `report.html` and `report.json`.

 ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true)

+Batch comparison between folders of GT and OCR files can be done by simply providing
+folders:
+~~~
+dinglehopper gt/ ocr/ report output_folder/
+~~~
+This assumes that you have files with the same name in both folders, e.g.
+`gt/00000001.page.xml` and `ocr/00000001.alto.xml`.
+
+The example generates reports for each set of files, with the prefix `report`, in the
+(automatically created) folder `output_folder/`.
+
+By default, the JSON report does not contain the character and word differences, only
+the calculated metrics. If you want to include the differences, pass the
+`--differences` option with a boolean value:
+
+~~~
+dinglehopper gt/ ocr/ report output_folder/ --differences true
+~~~
+
+### dinglehopper-summarize
+A set of (JSON) reports can be summarized into a single set of
+reports. This is useful after having generated reports in batch.
+Example:
+~~~
+dinglehopper-summarize output_folder/
+~~~
+This generates `summary.html` and `summary.json` in the same `output_folder`.
+
+If you are summarizing many reports and have used the `--differences` flag while
+generating them, it may be useful to limit the number of differences reported by using
+the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
+report, making it easier to open and navigate. Note that the JSON report will still
+contain all differences. Example:
+~~~
+dinglehopper-summarize output_folder/ --occurrences-threshold 10
+~~~
+
 ### dinglehopper-line-dirs
 You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.txt`)
 with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate
--- a/ocrd-tool.json
+++ b/ocrd-tool.json
@ -1 +1 @@
-qurator/dinglehopper/ocrd-tool.json
+src/dinglehopper/ocrd-tool.json
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,70 @@
+[build-system]
+requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"]
+
+[project]
+name = "dinglehopper"
+authors = [
+    {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
+    {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
+]
+description = "The OCR evaluation tool"
+readme = "README.md"
+requires-python = ">=3.6"
+keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
+
+dynamic = ["version", "dependencies", "optional-dependencies"]
+
+# https://pypi.org/classifiers/
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Environment :: Console",
+    "Intended Audience :: Science/Research",
+    "Intended Audience :: Other Audience",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+    "Topic :: Scientific/Engineering :: Information Analysis",
+    "Topic :: Text Processing",
+]
+
+[project.scripts]
+dinglehopper = "dinglehopper.cli:main"
+dinglehopper-line-dirs = "dinglehopper.cli_line_dirs:main"
+dinglehopper-extract = "dinglehopper.cli_extract:main"
+dinglehopper-summarize = "dinglehopper.cli_summarize:main"
+ocrd-dinglehopper = "dinglehopper.ocrd_cli:ocrd_dinglehopper"
+
+
+[project.urls]
+Homepage = "https://github.com/qurator-spk/dinglehopper"
+Repository = "https://github.com/qurator-spk/dinglehopper.git"
+
+
+[tool.setuptools.dynamic]
+dependencies = {file = ["requirements.txt"]}
+optional-dependencies.dev = {file = ["requirements-dev.txt"]}
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-data]
+dinglehopper = ["templates/*"]
+
+
+[tool.pytest.ini_options]
+minversion = 6.0
+addopts = "--strict-markers"
+markers = [
+    "integration: integration tests",
+]
+
+
+[tool.mypy]
+ignore_missing_imports = true
+
+
+[tool.ruff]
+select = ["E", "F", "I"]
+ignore = [
+    "F811",  # multimethods are considered redefinitions by ruff
+]
--- a/pytest.ini
+++ b/pytest.ini
@ -1,4 +0,0 @@
-[pytest]
-markers =
-    integration: integration tests
-    serial
--- a/qurator/__init__.py
+++ b/qurator/__init__.py
@ -1 +0,0 @@
-__import__("pkg_resources").declare_namespace(__name__)
--- a/qurator/dinglehopper/__init__.py
+++ b/qurator/dinglehopper/__init__.py
@ -1,5 +0,0 @@
-from .ocr_files import *
-from .extracted_text import *
-from .character_error_rate import *
-from .word_error_rate import *
-from .align import *
--- a/qurator/dinglehopper/templates/report.html.js
+++ b/qurator/dinglehopper/templates/report.html.js
@ -1,15 +0,0 @@
-function find_diff_class(classes) {
-    return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
-}
-
-$(document).ready(function() {
-    /* Enable Bootstrap tooltips */
-    $('[data-toggle="tooltip"]').tooltip();
-
-    $('.diff').mouseover(function() {
-        find_diff_class($(this).attr('class')).addClass('diff-highlight');
-    });
-    $('.diff').mouseout(function() {
-        find_diff_class($(this).attr('class')).removeClass('diff-highlight');
-    });
-});
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -1,5 +1,8 @@
 pytest
-pytest-flake8
 pytest-cov
 pytest-mypy
 black
+pre-commit
+
+ruff ; python_version >= "3.7"
+pytest-ruff ; python_version >= "3.7"
--- a/requirements.txt
+++ b/requirements.txt
@ -10,4 +10,4 @@ attrs
 multimethod >= 1.3
 tqdm
 rapidfuzz >= 2.7.0
-six  # XXX workaround OCR-D/core#730
+chardet
--- a/setup.cfg
+++ b/setup.cfg
@ -1,12 +0,0 @@
-[flake8]
-max-line-length = 88
-extend-ignore = E203, W503
-
-[pylint]
-max-line-length = 88
-
-[pylint.messages_control]
-disable = C0330, C0326
-
-[mypy]
-ignore_missing_imports = True
--- a/setup.py
+++ b/setup.py
@ -1,34 +0,0 @@
-from io import open
-from setuptools import find_packages, setup
-
-with open("requirements.txt") as fp:
-    install_requires = fp.read()
-
-with open('requirements-dev.txt') as fp:
-    tests_require = fp.read()
-
-setup(
-    name="dinglehopper",
-    author="Mike Gerber, The QURATOR SPK Team",
-    author_email="mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de",
-    description="The OCR evaluation tool",
-    long_description=open("README.md", "r", encoding="utf-8").read(),
-    long_description_content_type="text/markdown",
-    keywords="qurator ocr",
-    license="Apache",
-    namespace_packages=["qurator"],
-    packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
-    install_requires=install_requires,
-    tests_require=tests_require,
-    package_data={
-        "": ["*.json", "templates/*"],
-    },
-    entry_points={
-        "console_scripts": [
-            "dinglehopper=qurator.dinglehopper.cli:main",
-            "dinglehopper-line-dirs=qurator.dinglehopper.cli_line_dirs:main",
-            "dinglehopper-extract=qurator.dinglehopper.cli_extract:main",
-            "ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper",
-        ]
-    },
-)
--- a/src/dinglehopper/__init__.py
+++ b/src/dinglehopper/__init__.py
@ -0,0 +1,33 @@
+from .align import align, score_hint, seq_align
+from .character_error_rate import character_error_rate, character_error_rate_n
+from .edit_distance import distance, editops
+from .extracted_text import ExtractedText
+from .ocr_files import (
+    alto_namespace,
+    alto_text,
+    page_namespace,
+    page_text,
+    plain_text,
+    text,
+)
+from .word_error_rate import word_error_rate, word_error_rate_n, words
+
+__all__ = [
+    "editops",
+    "distance",
+    "align",
+    "score_hint",
+    "seq_align",
+    "character_error_rate",
+    "character_error_rate_n",
+    "word_error_rate",
+    "word_error_rate_n",
+    "words",
+    "ExtractedText",
+    "alto_namespace",
+    "alto_text",
+    "page_namespace",
+    "page_text",
+    "plain_text",
+    "text",
+]
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@ -1,9 +1,12 @@
 import math
+import unicodedata
 from math import ceil

-from .edit_distance import *
 from rapidfuzz.distance import Levenshtein

+from .edit_distance import grapheme_clusters
+
+
 def align(t1, t2):
    """Align text."""
    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@ -1,20 +1,22 @@
 import os
+from collections import Counter

 import click
 from jinja2 import Environment, FileSystemLoader
 from markupsafe import escape
 from ocrd_utils import initLogging
-from math import ceil

-from .character_error_rate import character_error_rate_n
-from .word_error_rate import word_error_rate_n, words_normalized
-from .align import seq_align, score_hint
-from .extracted_text import ExtractedText
-from .ocr_files import extract
-from .config import Config
+from dinglehopper.align import score_hint, seq_align
+from dinglehopper.character_error_rate import character_error_rate_n
+from dinglehopper.config import Config
+from dinglehopper.extracted_text import ExtractedText
+from dinglehopper.ocr_files import extract
+from dinglehopper.word_error_rate import word_error_rate_n, words_normalized


-def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
+def gen_diff_report(
+    gt_in, ocr_in, css_prefix, joiner, none, *, differences=False, score_hint=None
+):
    gtx = ""
    ocrx = ""

@ -31,16 +33,12 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):

        # Set Bootstrap tooltip to the segment id
        if id_:
-            html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
+            html_custom_attrs += f'data-toggle="tooltip" title="{id_}"'

        if css_classes:
-            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
-                css_classes=css_classes,
-                html_t=html_t,
-                html_custom_attrs=html_custom_attrs,
-            )
+            return f'<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'
        else:
-            return "{html_t}".format(html_t=html_t)
+            return f"{html_t}"

    if isinstance(gt_in, ExtractedText):
        if not isinstance(ocr_in, ExtractedText):
@ -53,6 +51,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):

    g_pos = 0
    o_pos = 0
+    found_differences = []
+
    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, score_hint)):
        css_classes = None
        gt_id = None
@ -65,6 +65,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
                # Deletions and inserts only produce one id + None, UI must
                # support this, i.e. display for the one id produced

+            if differences:
+                found_differences.append(f"{g} :: {o}")
+
        gtx += joiner + format_thing(g, css_classes, gt_id)
        ocrx += joiner + format_thing(o, css_classes, ocr_id)

@ -73,13 +76,18 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
        if o is not None:
            o_pos += len(o)

-    return """
+    found_differences = dict(Counter(elem for elem in found_differences))
+
+    return (
+        """
        <div class="row">
           <div class="col-md-6 gt">{}</div>
           <div class="col-md-6 ocr">{}</div>
        </div>
        """.format(
-        gtx, ocrx
+            gtx, ocrx
+        ),
+        found_differences,
    )


@ -96,11 +104,20 @@ def json_float(value):
        return str(value)


-def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
+def process(
+    gt,
+    ocr,
+    report_prefix,
+    reports_folder=".",
+    *,
+    metrics=True,
+    differences=False,
+    textequiv_level="region",
+):
    """Check OCR result against GT.

-    The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
-    Click on a wrapper.
+    The @click decorators change the signature of the decorated functions, so we keep
+    this undecorated version and use Click on a wrapper.
    """

    gt_text = extract(gt, textequiv_level=textequiv_level)
@ -109,15 +126,25 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
    ocr_words = words_normalized(ocr_text)

    cer, n_characters = character_error_rate_n(gt_text, ocr_text)
-    char_diff_report = gen_diff_report(
-        gt_text, ocr_text, css_prefix="c", joiner="", none="·",
-        score_hint=score_hint(cer, n_characters)
+    char_diff_report, diff_c = gen_diff_report(
+        gt_text,
+        ocr_text,
+        css_prefix="c",
+        joiner="",
+        none="·",
+        score_hint=score_hint(cer, n_characters),
+        differences=differences,
    )

    wer, n_words = word_error_rate_n(gt_words, ocr_words)
-    word_diff_report = gen_diff_report(
-        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯",
-        score_hint=score_hint(wer, n_words)
+    word_diff_report, diff_w = gen_diff_report(
+        gt_words,
+        ocr_words,
+        css_prefix="w",
+        joiner=" ",
+        none="⋯",
+        score_hint=score_hint(wer, n_words),
+        differences=differences,
    )

    env = Environment(
@ -129,7 +156,11 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):

    for report_suffix in (".html", ".json"):
        template_fn = "report" + report_suffix + ".j2"
-        out_fn = report_prefix + report_suffix
+
+        if not os.path.isdir(reports_folder):
+            os.mkdir(reports_folder)
+
+        out_fn = os.path.join(reports_folder, report_prefix + report_suffix)

        template = env.get_template(template_fn)
        template.stream(
@ -142,16 +173,46 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
            char_diff_report=char_diff_report,
            word_diff_report=word_diff_report,
            metrics=metrics,
+            differences=differences,
+            diff_c=diff_c,
+            diff_w=diff_w,
        ).dump(out_fn)


+def process_dir(
+    gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level
+):
+    for gt_file in os.listdir(gt):
+        gt_file_path = os.path.join(gt, gt_file)
+        ocr_file_path = os.path.join(ocr, gt_file)
+
+        if os.path.isfile(gt_file_path) and os.path.isfile(ocr_file_path):
+            process(
+                gt_file_path,
+                ocr_file_path,
+                f"{gt_file}-{report_prefix}",
+                reports_folder=reports_folder,
+                metrics=metrics,
+                differences=differences,
+                textequiv_level=textequiv_level,
+            )
+        else:
+            print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
+
+
@click.command()
@click.argument("gt", type=click.Path(exists=True))
@click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report")
+@click.argument("reports_folder", type=click.Path(), default=".")
@click.option(
    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
 )
+@click.option(
+    "--differences",
+    default=False,
+    help="Enable reporting character and word level differences",
+)
@click.option(
    "--textequiv-level",
    default="region",
@ -159,7 +220,16 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
    metavar="LEVEL",
 )
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
-def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
+def main(
+    gt,
+    ocr,
+    report_prefix,
+    reports_folder,
+    metrics,
+    differences,
+    textequiv_level,
+    progress,
+):
    """
    Compare the PAGE/ALTO/text document GT against the document OCR.

@ -171,7 +241,8 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
    that case, use --no-metrics to disable the then meaningless metrics and also
    change the color scheme from green/red to blue.

-    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
+    The comparison report will be written to $REPORTS_FOLDER/$REPORT_PREFIX.{html,json},
+    where $REPORTS_FOLDER defaults to the current working directory and
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).

@ -180,7 +251,31 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
    """
    initLogging()
    Config.progress = progress
-    process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
+    if os.path.isdir(gt):
+        if not os.path.isdir(ocr):
+            raise click.BadParameter(
+                "OCR must be a directory if GT is a directory", param_hint="ocr"
+            )
+        else:
+            process_dir(
+                gt,
+                ocr,
+                report_prefix,
+                reports_folder,
+                metrics,
+                differences,
+                textequiv_level,
+            )
+    else:
+        process(
+            gt,
+            ocr,
+            report_prefix,
+            reports_folder,
+            metrics=metrics,
+            differences=differences,
+            textequiv_level=textequiv_level,
+        )


 if __name__ == "__main__":
--- a/qurator/dinglehopper/cli_extract.py
+++ b/qurator/dinglehopper/cli_extract.py
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@ -1,15 +1,15 @@
-import os
 import itertools
+import os

 import click
 from jinja2 import Environment, FileSystemLoader
 from ocrd_utils import initLogging
-from math import ceil

+from .align import score_hint
 from .character_error_rate import character_error_rate_n
-from .word_error_rate import word_error_rate_n, words_normalized
-from .ocr_files import plain_extract
 from .cli import gen_diff_report, json_float
+from .ocr_files import plain_extract
+from .word_error_rate import word_error_rate_n, words_normalized


 def all_equal(iterable):
@ -75,12 +75,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):

        # Generate diff reports
        char_diff_report += gen_diff_report(
-            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·",
-            score_hint=score_hint(l_cer, l_n_characters)
+            gt_text,
+            ocr_text,
+            css_prefix="l{0}-c".format(k),
+            joiner="",
+            none="·",
+            score_hint=score_hint(l_cer, l_n_characters),
        )
        word_diff_report += gen_diff_report(
-            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯",
-            score_hint=score_hint(l_wer, l_n_words)
+            gt_words,
+            ocr_words,
+            css_prefix="l{0}-w".format(k),
+            joiner=" ",
+            none="⋯",
+            score_hint=score_hint(l_wer, l_n_words),
        )

    env = Environment(
--- a/src/dinglehopper/cli_summarize.py
+++ b/src/dinglehopper/cli_summarize.py
@ -0,0 +1,106 @@
+import json
+import os
+
+import click
+from jinja2 import Environment, FileSystemLoader
+from ocrd_utils import initLogging
+
+from dinglehopper.cli import json_float
+
+
+def process(reports_folder, occurrences_threshold=1):
+    """Summarize the dinglehopper JSON reports found in a folder.
+
+    Every ``*.json`` file in ``reports_folder`` that contains both a ``"cer"`` and
+    a ``"wer"`` entry is counted as a report. Its rates go into the averages, and
+    the occurrence counts of its optional ``"differences"`` entry are added up per
+    difference. The result is printed and rendered to ``summary.html`` and
+    ``summary.json`` inside ``reports_folder``.
+
+    :param reports_folder: Folder containing the JSON reports; the summary files
+        are written into the same folder.
+    :param occurrences_threshold: Passed on to the templates; the HTML template
+        leaves out differences that occur fewer times than this.
+    :return: None. If no usable report is found, a message is shown and no summary
+        is written.
+    """
+    cer_list = []
+    wer_list = []
+    cer_sum = 0
+    wer_sum = 0
+    diff_c = {}
+    diff_w = {}
+
+    # Sorted, so that the sums and the order of the collected differences do not
+    # depend on the arbitrary order of os.listdir().
+    for report in sorted(os.listdir(reports_folder)):
+        if not report.endswith(".json"):
+            continue
+
+        # The reports are JSON, so read them as UTF-8 instead of relying on the
+        # locale's default encoding.
+        report_fn = os.path.join(reports_folder, report)
+        with open(report_fn, "r", encoding="utf-8") as f:
+            report_data = json.load(f)
+
+        # Not a report with metrics, e.g. a summary.json written by an earlier run.
+        if "cer" not in report_data or "wer" not in report_data:
+            click.echo(f"Skipping {report} because it does not contain CER and WER")
+            continue
+
+        cer = report_data["cer"]
+        wer = report_data["wer"]
+        cer_list.append(cer)
+        wer_list.append(wer)
+        cer_sum += cer
+        wer_sum += wer
+
+        # Reports only contain differences if they were requested, so a missing
+        # entry (or a missing level) is not an error.
+        differences = report_data.get("differences", {})
+        for level, totals in (("character_level", diff_c), ("word_level", diff_w)):
+            for key, value in differences.get(level, {}).items():
+                totals[key] = totals.get(key, 0) + value
+
+    if not cer_list:
+        click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
+        return
+
+    cer_avg = cer_sum / len(cer_list)
+    wer_avg = wer_sum / len(wer_list)
+
+    print(f"Number of reports: {len(cer_list)}")
+    print(f"Average CER: {cer_avg}")
+    print(f"Average WER: {wer_avg}")
+    # These two values are the summed error rates, not counts of mistakes.
+    print(f"Sum of CER: {cer_sum}")
+    print(f"Sum of WER: {wer_sum}")
+
+    # The templates live in the "templates" folder next to this module.
+    env = Environment(
+        loader=FileSystemLoader(
+            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
+        )
+    )
+    env.filters["json_float"] = json_float
+    for report_suffix in (".html", ".json"):
+        template_fn = "summary" + report_suffix + ".j2"
+
+        out_fn = os.path.join(reports_folder, "summary" + report_suffix)
+        template = env.get_template(template_fn)
+        template.stream(
+            num_reports=len(cer_list),
+            cer_avg=cer_avg,
+            wer_avg=wer_avg,
+            diff_c=diff_c,
+            diff_w=diff_w,
+            occurrences_threshold=occurrences_threshold,
+        ).dump(out_fn)
+
+
+# Command line entry point; click uses the docstring below as the --help text.
+@click.command()
+@click.argument("reports_folder", type=click.Path(exists=True), default="./reports")
+@click.option(
+    "--occurrences-threshold",
+    type=int,
+    default=1,
+    help="Only show differences that occur at least this many times.",
+)
+def main(reports_folder, occurrences_threshold):
+    """
+    Summarize the results from multiple reports generated earlier by dinglehopper.
+    It calculates the average CER and WER, as well as a sum of common mistakes.
+    Reports include lists of mistakes and their occurrences.
+
+    You may use a threshold to reduce the file size of the HTML report by only showing
+    mistakes that occur at least as often as the threshold. The JSON report will
+    always contain all mistakes.
+
+    All JSON files in the provided folder will be gathered and summarized.
+    """
+    initLogging()
+    # The summary files are written into reports_folder itself.
+    process(reports_folder, occurrences_threshold)
+
+
+if __name__ == "__main__":
+    main()
--- a/qurator/dinglehopper/config.py
+++ b/qurator/dinglehopper/config.py
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@ -1,8 +1,8 @@
 import unicodedata

 from multimethod import multimethod
-from uniseg.graphemecluster import grapheme_clusters
 from rapidfuzz.distance import Levenshtein
+from uniseg.graphemecluster import grapheme_clusters

 from .extracted_text import ExtractedText

--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
--- a/qurator/dinglehopper/notebooks/Levenshtein.ipynb
+++ b/qurator/dinglehopper/notebooks/Levenshtein.ipynb
--- a/qurator/dinglehopper/notebooks/Unicode
+++ b/qurator/dinglehopper/notebooks/Unicode
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@ -2,6 +2,7 @@ import os
 import sys
 from typing import Iterator

+import chardet
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
 from uniseg.graphemecluster import grapheme_clusters
@ -12,8 +13,8 @@ from .extracted_text import ExtractedText, normalize_sbb
 def alto_namespace(tree: ET.ElementTree) -> str:
    """Return the ALTO namespace used in the given ElementTree.

-    This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not
-    check if the files uses any valid ALTO namespace.
+    This relies on the assumption that, in any given ALTO file, the root element has the
+    local name "alto". We do not check if the files uses any valid ALTO namespace.
    """
    root_name = ET.QName(tree.getroot().tag)
    if root_name.localname == "alto":
@ -48,8 +49,9 @@ def alto_text(tree):
 def page_namespace(tree):
    """Return the PAGE content namespace used in the given ElementTree.

-    This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We
-    do not check if the files uses any valid PAGE namespace.
+    This relies on the assumption that, in any given PAGE content file, the root element
+    has the local name "PcGts". We do not check if the files uses any valid PAGE
+    namespace.
    """
    root_name = ET.QName(tree.getroot().tag)
    if root_name.localname == "PcGts":
@ -135,6 +137,10 @@ def page_text(tree, *, textequiv_level="region"):
    return page_extract(tree, textequiv_level=textequiv_level).text


+def detect_encoding(filename):
+    """Guess the text encoding of a file from its first 1024 bytes.
+
+    Returns the ``"encoding"`` entry of chardet's detection result, which the
+    caller passes on to ``open()``.
+    """
+    # Use a context manager so the file handle is closed right away instead of
+    # being left to the garbage collector.
+    with open(filename, "rb") as f:
+        return chardet.detect(f.read(1024))["encoding"]
+
+
 def plain_extract(filename, include_filename_in_id=False):
    id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"

@ -149,7 +155,8 @@ def plain_extract(filename, include_filename_in_id=False):
            clusters,
        )

-    with open(filename, "r") as f:
+    fileencoding = detect_encoding(filename)
+    with open(filename, "r", encoding=fileencoding) as f:
        return ExtractedText(
            None,
            [make_segment(no, line) for no, line in enumerate(f.readlines())],
@ -171,7 +178,7 @@ def extract(filename, *, textequiv_level="region"):
    """
    try:
        tree = ET.parse(filename)
-    except XMLSyntaxError:
+    except (XMLSyntaxError, UnicodeDecodeError):
        return plain_extract(filename)
    try:
        return page_extract(tree, textequiv_level=textequiv_level)
--- a/qurator/dinglehopper/ocrd-tool.json
+++ b/qurator/dinglehopper/ocrd-tool.json
@ -1,4 +1,5 @@
 {
+  "version": "0.9.4",
  "git_url": "https://github.com/qurator-spk/dinglehopper",
  "tools": {
    "ocrd-dinglehopper": {
--- a/qurator/dinglehopper/ocrd_cli.py
+++ b/qurator/dinglehopper/ocrd_cli.py
@ -4,7 +4,7 @@ import os
 import click
 from ocrd import Processor
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
-from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
+from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
 from pkg_resources import resource_string

 from .cli import process as cli_process
--- a/qurator/dinglehopper/templates/report.html.j2
+++ b/qurator/dinglehopper/templates/report.html.j2
@ -26,6 +26,22 @@
      border: 2px solid;
      border-radius: 5px;
    }
+
+    .row {
+        margin-bottom: 20px;
+    }
+
+    table {
+        width: 100%;
+    }
+
+    th {
+        cursor: pointer;
+    }
+
+    th:hover {
+        background-color: #eee;
+    }
    </style>
 </head>
 <body>
@ -50,6 +66,32 @@
 <h2>Word differences</h2>
 {{ word_diff_report }}

+{%- if differences %}
+{# One table per level, listing how often each GT/OCR difference was found. #}
+{% set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
+
+<div class="row">
+{% for section in sections %}
+    <div class="col-md-6">
+        <h2>{{ section['title'] }}</h2>
+        <table>
+            <thead>
+            <tr>
+                <th>GT</th>
+                <th>OCR</th>
+                <th>Occurrences</th>
+            </tr>
+            </thead>
+            {# The column sorting script only moves rows that are inside a tbody. #}
+            <tbody>
+            {# Keys have the form "GT :: OCR". #}
+            {% for gt_ocr, occurrences in section['data'].items() %}
+                <tr>
+                    <td>{{ gt_ocr.split(" :: ")[0] }}</td>
+                    <td>{{ gt_ocr.split(" :: ")[1] }}</td>
+                    <td>{{ occurrences }}</td>
+                </tr>
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+{% endfor %}
+</div>
+{%- endif %}

 </div>

--- a/src/dinglehopper/templates/report.html.js
+++ b/src/dinglehopper/templates/report.html.js
@ -0,0 +1,39 @@
+/**
+ * Select all elements that share the diff class of an element.
+ *
+ * @param {string} classes - Value of an element's class attribute.
+ * @returns A jQuery selection of every element carrying the first class name
+ *     in `classes` that matches /.diff\d.*\/.
+ */
+function find_diff_class(classes) {
+    return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
+}
+
+/* Wire up tooltips, diff highlighting and table sorting once the DOM is ready. */
+$(document).ready(function() {
+    /* Enable Bootstrap tooltips */
+    $('[data-toggle="tooltip"]').tooltip();
+
+    /* Highlight all elements with the same diff class while hovering over one */
+    $('.diff').mouseover(function() {
+        find_diff_class($(this).attr('class')).addClass('diff-highlight');
+    });
+    $('.diff').mouseout(function() {
+        find_diff_class($(this).attr('class')).removeClass('diff-highlight');
+    });
+
+    /* Sort this column of the table */
+    $('th').click(function () {
+        var table = $(this).closest('table');
+        var rows = table.find('tbody > tr').toArray().sort(compareRows($(this).index()));
+        /* The direction is remembered per header cell and flips on every click;
+           the first click sorts ascending. */
+        this.asc = !this.asc;
+        if (!this.asc) {
+            rows = rows.reverse();
+        }
+        /* Appending a row that is already in the table moves it to the end */
+        for (var i = 0; i < rows.length; i++) {
+            table.children('tbody').append(rows[i]);
+        }
+    });
+
+    /* Return a comparison function for two rows, based on the lower-cased text
+       of the cell in column `index`. */
+    function compareRows(index) {
+        return function (row1, row2) {
+            var cell1 = $(row1).children('td').eq(index).text().toLowerCase();
+            var cell2 = $(row2).children('td').eq(index).text().toLowerCase();
+            /* numeric: true compares digit sequences by value, e.g. "2" < "10" */
+            return cell1.localeCompare(cell2, undefined, {
+                numeric: true,
+                sensitivity: 'base'
+            });
+        }
+    }
+});
--- a/qurator/dinglehopper/templates/report.json.j2
+++ b/qurator/dinglehopper/templates/report.json.j2
@ -4,6 +4,12 @@
 {% if metrics %}
    "cer": {{ cer|json_float }},
    "wer": {{ wer|json_float }},
+{% endif %}
+{% if differences %}
+    "differences": {
+        "character_level": {{ diff_c|tojson }},
+        "word_level": {{ diff_w|tojson }}
+    },
 {% endif %}
    "n_characters": {{ n_characters }},
    "n_words": {{ n_words }}
--- a/src/dinglehopper/templates/summary.html.j2
+++ b/src/dinglehopper/templates/summary.html.j2
@ -0,0 +1,136 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+
+    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
+    <style type="text/css">
+    {% if metrics %}
+    .gt .diff {
+        color: green;
+    }
+    .ocr .diff {
+        color: red;
+    }
+    {% else %}
+    .gt .diff, .ocr .diff {
+        color: blue;
+    }
+    {% endif %}
+    .ellipsis {
+        opacity: 0.5;
+        font-style: italic;
+    }
+    .diff-highlight {
+      border: 2px solid;
+      border-radius: 5px;
+    }
+
+    .row {
+        margin-bottom: 20px;
+    }
+
+    table {
+        width: 100%;
+    }
+
+    .cer {
+        flex-direction: column;
+    }
+
+    tr:hover {
+        background-color: #f5f5f5;
+    }
+
+    th {
+        cursor: pointer;
+    }
+
+    th:hover {
+        background-color: #eee;
+    }
+
+    td {
+        min-width: 100px;
+    }
+
+    td:hover {
+        background-color: #eee;
+    }
+    </style>
+</head>
+<body>
+
+<div class="container">
+
+<div class="row">
+    <h1>Summary of all reports</h1>
+</div>
+
+<div class="row">
+    <p>Number of reports: {{ num_reports }}</p>
+</div>
+
+{# An average of 0.0 is a valid result, so test for numbers, not truthiness. #}
+{% if cer_avg is number and wer_avg is number -%}
+<div class="row">
+    <h2>Metrics</h2>
+</div>
+
+<div class="row cer">
+    <p>Average CER: {{ cer_avg|round(4) }}</p>
+    <p>Average WER: {{ wer_avg|round(4) }}</p>
+</div>
+{% endif %}
+
+{%- if diff_c and diff_w %}
+{%- set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
+
+<div class="row">
+{%- for section in sections %}
+    <div class="col-md-6">
+        <h2>{{ section['title'] }}</h2>
+        <table>
+            <thead>
+            <tr><th>GT</th><th>OCR</th><th>Occurrences</th></tr>
+            </thead>
+            {%- set num_omitted = namespace(value=0) -%}
+            {% for gt_ocr, occurrences in section['data'].items() -%}
+                {% if occurrences < occurrences_threshold -%}
+                    {%- set num_omitted.value = num_omitted.value + 1 %}
+                {%- else -%}
+                    {%- set gt = gt_ocr.split(" :: ")[0] %}
+                    {%- set ocr = gt_ocr.split(" :: ")[1] %}
+                    <tr>
+                        <td title="{{ gt|urlencode }}">{{ gt }}</td>{# display the unicode character #}
+                        <td title="{{ ocr|urlencode }}">{{ ocr }}</td >
+                        <td>{{ occurrences }}</td>
+                    </tr>
+                {%- endif %}
+            {%- endfor %}
+
+            {% if num_omitted.value > 0  and occurrences_threshold > 1 -%}
+                <p>Skipped {{ num_omitted.value }} diffs with fewer than {{ occurrences_threshold }} occurrences. The complete list of diffs is available in the accompanying JSON file.</p>
+                {%- set num_omitted.value = 0 %}
+            {%- endif %}
+        </table>
+    </div>
+{%- endfor %}
+</div>
+{%- endif %}
+
+</div>
+
+
+
+<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
+<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
+
+<script>
+{% include 'report.html.js' %}
+</script>
+
+
+</body>
+</html>
--- a/src/dinglehopper/templates/summary.json.j2
+++ b/src/dinglehopper/templates/summary.json.j2
@ -0,0 +1,15 @@
+{
+"num_reports": {{ num_reports}}
+{#- An average of 0.0 is a valid result, so test for numbers, not truthiness. #}
+{%- if cer_avg is number and wer_avg is number %}
+    ,
+    "cer_avg": {{ cer_avg|json_float }},
+    "wer_avg": {{ wer_avg|json_float }}
+{%- endif %}
+{#- Emit the differences as soon as either level has entries. #}
+{%- if diff_c or diff_w %}
+    ,
+    "differences": {
+        "character_level": {{ diff_c|tojson }},
+        "word_level": {{ diff_w|tojson }}
+    }
+{%- endif %}
+}
--- a/qurator/dinglehopper/tests/init.py
+++ b/qurator/dinglehopper/tests/init.py
--- a/qurator/dinglehopper/tests/data/00000119.tif
+++ b/qurator/dinglehopper/tests/data/00000119.tif
--- a/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-GT-PAGE/00000024.page.xml
+++ b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-GT-PAGE/00000024.page.xml
--- a/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml
+++ b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml
--- a/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml
+++ b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml
--- a/qurator/dinglehopper/tests/data/actevedef_718448162/mets.xml
+++ b/qurator/dinglehopper/tests/data/actevedef_718448162/mets.xml
--- a/src/dinglehopper/tests/data/bigger-texts/00008228/00008228-00236534.gt4hist.xml
+++ b/src/dinglehopper/tests/data/bigger-texts/00008228/00008228-00236534.gt4hist.xml
--- a/src/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
+++ b/src/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
--- a/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml
+++ b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml
--- a/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml
+++ b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml
--- a/qurator/dinglehopper/tests/data/test-gt.page2018.xml
+++ b/qurator/dinglehopper/tests/data/test-gt.page2018.xml
--- a/src/dinglehopper/tests/data/directory-test/gt/2.xml
+++ b/src/dinglehopper/tests/data/directory-test/gt/2.xml
--- a/qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml
+++ b/qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml
--- a/src/dinglehopper/tests/data/directory-test/ocr/2.xml
+++ b/src/dinglehopper/tests/data/directory-test/ocr/2.xml
--- a/src/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
+++ b/src/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
--- a/qurator/dinglehopper/tests/data/levels-are-different.page.xml
+++ b/qurator/dinglehopper/tests/data/levels-are-different.page.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.gt.page.xml
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.gt.page.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt
--- a/qurator/dinglehopper/tests/data/mixed-regions.page.xml
+++ b/qurator/dinglehopper/tests/data/mixed-regions.page.xml
--- a/qurator/dinglehopper/tests/data/order.page.xml
+++ b/qurator/dinglehopper/tests/data/order.page.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-no-reading-order.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-no-reading-order.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-order-0001.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-order-0001.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-order-0002.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-order-0002.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-region.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-region.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-unordered.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-unordered.xml
--- a/src/dinglehopper/tests/data/test-fake-ocr.page2018.xml
+++ b/src/dinglehopper/tests/data/test-fake-ocr.page2018.xml
--- a/src/dinglehopper/tests/data/test-gt.page2018.xml
+++ b/src/dinglehopper/tests/data/test-gt.page2018.xml
--- a/qurator/dinglehopper/tests/data/test.alto1.xml
+++ b/qurator/dinglehopper/tests/data/test.alto1.xml
@ -20183,4 +20183,4 @@
            </PrintSpace>
        </Page>
    </Layout>
-</alto>
+</alto>
--- a/qurator/dinglehopper/tests/data/test.alto2.xml
+++ b/qurator/dinglehopper/tests/data/test.alto2.xml
@ -61,4 +61,4 @@
 </PrintSpace>
 </Page>
 </Layout>
-</alto>
+</alto>
--- a/qurator/dinglehopper/tests/data/test.alto3.xml
+++ b/qurator/dinglehopper/tests/data/test.alto3.xml
--- a/src/dinglehopper/tests/data/test.page2018.xml
+++ b/src/dinglehopper/tests/data/test.page2018.xml
--- a/qurator/dinglehopper/tests/data/test.txt
+++ b/qurator/dinglehopper/tests/data/test.txt
@ -1 +1 @@
-Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
+Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
--- a/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/462875_0008.jpg
+++ b/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/462875_0008.jpg
--- a/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/OCR-D-GT_0008.xml
+++ b/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/OCR-D-GT_0008.xml
--- a/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/OCR-D-OCR-TESS_0008.xml
+++ b/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/OCR-D-OCR-TESS_0008.xml
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@ -6,7 +6,7 @@ import pytest
 from lxml import etree as ET
 from uniseg.graphemecluster import grapheme_clusters

-from .. import seq_align, ExtractedText
+from .. import ExtractedText, seq_align


 def test_text():
@ -30,12 +30,20 @@ def test_text():

 def test_normalization_check():
    with pytest.raises(ValueError, match=r".*is not in NFC.*"):
-        ExtractedText("foo", None, None,
-                      unicodedata.normalize("NFD", "Schlyñ"),
-                      grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")))
-    assert ExtractedText("foo", None, None,
-                         unicodedata.normalize("NFC", "Schlyñ"),
-                         grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")))
+        ExtractedText(
+            "foo",
+            None,
+            None,
+            unicodedata.normalize("NFD", "Schlyñ"),
+            grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")),
+        )
+    assert ExtractedText(
+        "foo",
+        None,
+        None,
+        unicodedata.normalize("NFC", "Schlyñ"),
+        grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")),
+    )


 AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
--- a/qurator/dinglehopper/tests/test_align.py
+++ b/qurator/dinglehopper/tests/test_align.py
@ -1,7 +1,9 @@
 import math
+
 import pytest
+
+from .. import align, distance, score_hint, seq_align
 from .util import unzip
-from .. import align, seq_align, distance, score_hint


 def test_left_empty():
@ -72,7 +74,8 @@ def test_with_some_fake_ocr_errors():
    result = list(
        align(
            "Über die vielen Sorgen wegen desselben vergaß",
-            "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
+            "SomeJunk MoreJunk "
+            + "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
        )
    )
    left, right = unzip(result)
@ -183,6 +186,7 @@ def test_lines_similar():
    # Test __eq__ (i.e. is it a substitution or a similar string?)
    assert list(left)[0] == list(right)[0]

+
 def test_score_hint():
    assert score_hint(0.5, 23) == 12  # int(ceil())
    assert score_hint(math.inf, 12345) is None
--- a/qurator/dinglehopper/tests/test_character_error_rate.py
+++ b/qurator/dinglehopper/tests/test_character_error_rate.py
@ -36,6 +36,7 @@ def test_character_error_rate_hard():
        len(s2) == 7
    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points

-    # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
+    # Both strings have the same length in terms of grapheme clusters. So the CER should
+    # be symmetrical.
    assert character_error_rate(s2, s1) == 1 / 6
    assert character_error_rate(s1, s2) == 1 / 6
--- a/qurator/dinglehopper/tests/test_edit_distance.py
+++ b/qurator/dinglehopper/tests/test_edit_distance.py
--- a/qurator/dinglehopper/tests/test_editops.py
+++ b/qurator/dinglehopper/tests/test_editops.py
--- a/qurator/dinglehopper/tests/test_integ_align.py
+++ b/qurator/dinglehopper/tests/test_integ_align.py
@ -15,7 +15,9 @@ def test_align_page_files():
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
    # → 2 elements in the alignment should be different, the ligature is
    # (currently) not counted due to normalization.
-    # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
+    #
+    # NOTE: In this example, it doesn't matter that we work with "characters", not
+    # grapheme clusters.

    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
    ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
--- a/src/dinglehopper/tests/test_integ_bigger_texts.py
+++ b/src/dinglehopper/tests/test_integ_bigger_texts.py
@ -0,0 +1,28 @@
+from __future__ import division, print_function
+
+import os
+
+import pytest
+from lxml import etree as ET
+
+from .. import alto_text, character_error_rate, page_text
+
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+@pytest.mark.integration
+def test_bigger_texts():
+    """Test that the CER of a bigger PAGE GT / ALTO OCR text pair can be computed."""
+    gt = page_text(
+        ET.parse(os.path.join(data_dir, "bigger-texts", "00008228", "00008228.gt.xml"))
+    )
+    ocr = alto_text(
+        ET.parse(
+            os.path.join(
+                data_dir, "bigger-texts", "00008228", "00008228-00236534.gt4hist.xml"
+            )
+        )
+    )
+
+    # Only interested in a result here: In earlier versions this would have used
+    # tens of GB of RAM and should now not break a sweat.
+    assert character_error_rate(gt, ocr) >= 0.0
--- a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
@ -6,7 +6,7 @@ import pytest
 from lxml import etree as ET
 from uniseg.graphemecluster import grapheme_clusters

-from .. import character_error_rate, page_text, alto_text
+from .. import alto_text, character_error_rate, page_text

 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")

--- a/src/dinglehopper/tests/test_integ_cli_dir.py
+++ b/src/dinglehopper/tests/test_integ_cli_dir.py
@ -0,0 +1,53 @@
+import os
+
+import pytest
+from ocrd_utils import initLogging
+
+from dinglehopper.cli import process_dir
+
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+@pytest.mark.integration
+def test_cli_directory(tmp_path):
+    """
+    Run cli/process_dir() on the directory-test data and check that a JSON and
+    an HTML report is written for each of the files 1.xml and 2.xml.
+    """
+    gt_dir = os.path.join(data_dir, "directory-test", "gt")
+    ocr_dir = os.path.join(data_dir, "directory-test", "ocr")
+    reports_dir = tmp_path / "reports"
+
+    initLogging()
+    process_dir(gt_dir, ocr_dir, "report", str(reports_dir), False, True, "line")
+
+    for name in ("1.xml", "2.xml"):
+        for suffix in (".json", ".html"):
+            assert os.path.exists(reports_dir / f"{name}-report{suffix}")
+
+
+@pytest.mark.integration
+def test_cli_fail_without_gt(tmp_path):
+    """
+    Test that the cli/process_dir skips a file if there is no corresponding file
+    in the other directory.
+    """
+
+    initLogging()
+    process_dir(
+        os.path.join(data_dir, "directory-test", "gt"),
+        os.path.join(data_dir, "directory-test", "ocr"),
+        "report",
+        str(tmp_path / "reports"),
+        False,
+        True,
+        "line",
+    )
+
+    # Expect exactly 4 files, presumably a JSON and an HTML report for each of the
+    # two files that have a counterpart, and nothing for ocr/3-has-no-gt.xml.
+    assert len(os.listdir(tmp_path / "reports")) == 2 * 2
--- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
+++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
@ -1,9 +1,9 @@
 import json

 import pytest
-from .util import working_directory

 from ..cli import process
+from .util import working_directory


@pytest.mark.integration
--- a/src/dinglehopper/tests/test_integ_differences.py
+++ b/src/dinglehopper/tests/test_integ_differences.py
@ -0,0 +1,37 @@
+import json
+import os
+
+import pytest
+from ocrd_utils import initLogging
+
+from dinglehopper.cli import process
+
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+@pytest.mark.integration
+def test_cli_differences(tmp_path):
+    """Test that the JSON report written by cli/process() lists the character
+    and word differences between the GT and the OCR text"""
+    gt_fn = os.path.join(data_dir, "test-gt.page2018.xml")
+    ocr_fn = os.path.join(data_dir, "test-fake-ocr.page2018.xml")
+    report_fn = tmp_path / "report.json"
+    expected_differences = {
+        "character_level": {"n :: m": 1, "ſ :: f": 1},
+        "word_level": {
+            "Augenblick :: Augemblick": 1,
+            "Verſprochene :: Verfprochene": 1,
+        },
+    }
+
+    initLogging()
+    process(gt_fn, ocr_fn, "report", tmp_path, differences=True)
+
+    assert os.path.exists(report_fn)
+
+    with open(report_fn, "r") as jsonf:
+        report = json.load(jsonf)
+
+    assert report["differences"] == expected_differences
--- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
@ -5,7 +5,7 @@ import os
 import pytest
 from lxml import etree as ET

-from .. import distance, page_text, alto_text
+from .. import alto_text, distance, page_text

 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")

--- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
+++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
@ -1,21 +1,20 @@
+import json
 import os
 import shutil
-import json
 import sys
 from pathlib import Path

 import pytest
 from click.testing import CliRunner
-from .util import working_directory
-

 from ..ocrd_cli import ocrd_dinglehopper
+from .util import working_directory

 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
-@pytest.mark.skipif(sys.platform == 'win32', reason="only on unix")
+@pytest.mark.skipif(sys.platform == "win32", reason="only on unix")
 def test_ocrd_cli(tmp_path):
    """Test OCR-D interface"""

--- a/src/dinglehopper/tests/test_integ_summarize.py
+++ b/src/dinglehopper/tests/test_integ_summarize.py
@ -0,0 +1,110 @@
+import json
+import os
+
+import pytest
+
+from .. import cli_summarize
+from .util import working_directory
+
+expected_cer_avg = (0.05 + 0.10) / 2
+expected_wer_avg = (0.15 + 0.20) / 2
+expected_diff_c = {"a": 30, "b": 50}
+expected_diff_w = {"c": 70, "d": 90}
+
+
+@pytest.fixture
+def create_summaries(tmp_path):
+    """Create two summary reports with mock data"""
+    reports_dirname = tmp_path / "reports"
+    reports_dirname.mkdir()
+
+    report1 = {
+        "cer": 0.05,
+        "wer": 0.15,
+        "differences": {
+            "character_level": {"a": 10, "b": 20},
+            "word_level": {"c": 30, "d": 40},
+        },
+    }
+    report2 = {
+        "cer": 0.10,
+        "wer": 0.20,
+        "differences": {
+            "character_level": {"a": 20, "b": 30},
+            "word_level": {"c": 40, "d": 50},
+        },
+    }
+
+    with open(os.path.join(reports_dirname, "report1.json"), "w") as f:
+        json.dump(report1, f)
+    with open(os.path.join(reports_dirname, "report2.json"), "w") as f:
+        json.dump(report2, f)
+
+    return str(reports_dirname)
+
+
+@pytest.mark.integration
+def test_cli_summarize_json(tmp_path, create_summaries):
+    """Test that the cli/process() yields a summarized JSON report"""
+    with working_directory(tmp_path):
+        reports_dirname = create_summaries
+        cli_summarize.process(reports_dirname)
+
+        with open(os.path.join(reports_dirname, "summary.json"), "r") as f:
+            summary_data = json.load(f)
+
+        assert summary_data["num_reports"] == 2
+        assert summary_data["cer_avg"] == expected_cer_avg
+        assert summary_data["wer_avg"] == expected_wer_avg
+        assert summary_data["differences"]["character_level"] == expected_diff_c
+        assert summary_data["differences"]["word_level"] == expected_diff_w
+
+
+@pytest.mark.integration
+def test_cli_summarize_html(tmp_path, create_summaries):
+    """Test that the cli/process() yields an HTML report"""
+    with working_directory(tmp_path):
+        reports_dirname = create_summaries
+        cli_summarize.process(reports_dirname)
+
+        html_file = os.path.join(reports_dirname, "summary.html")
+        assert os.path.isfile(html_file)
+
+        with open(html_file, "r") as f:
+            contents = f.read()
+
+            assert len(contents) > 0
+            assert "Number of reports: 2" in contents
+            assert f"Average CER: {round(expected_cer_avg, 4)}" in contents
+            assert f"Average WER: {round(expected_wer_avg, 4)}" in contents
+
+
+@pytest.mark.integration
+def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries):
+    """
+    Test that the cli/process() does not include reports that are missing a WER value.
+    """
+    with working_directory(tmp_path):
+        reports_dirname = create_summaries
+
+        # This third report has no WER value and should not be included in the summary
+        report3 = {
+            "cer": 0.10,
+            "differences": {
+                "character_level": {"a": 20, "b": 30},
+                "word_level": {"c": 40, "d": 50},
+            },
+        }
+
+        with open(os.path.join(reports_dirname, "report3-missing-wer.json"), "w") as f:
+            json.dump(report3, f)
+
+        cli_summarize.process(reports_dirname)
+
+        html_file = os.path.join(reports_dirname, "summary.html")
+        assert os.path.isfile(html_file)
+
+        with open(html_file, "r") as f:
+            contents = f.read()
+
+            assert "Number of reports: 2" in contents  # report3 is not included
--- a/qurator/dinglehopper/tests/test_integ_table_extraction.py
+++ b/qurator/dinglehopper/tests/test_integ_table_extraction.py
--- a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
@ -5,15 +5,15 @@ import os
 import pytest
 from lxml import etree as ET

-from .. import word_error_rate, words, page_text, alto_text
+from .. import alto_text, page_text, word_error_rate, words

 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
 def test_word_error_rate_between_page_files():
-    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
-    # the ligature does not count → 2 errors
+    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
+    # So we have 3 changed words, the ligature does not count → 2 errors
    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))

    gt_word_count = (
--- a/qurator/dinglehopper/tests/test_ocr_files.py
+++ b/qurator/dinglehopper/tests/test_ocr_files.py
@ -1,13 +1,11 @@
 import os
 import re
-
-import lxml.etree as ET
 import textwrap

-import pytest
+import lxml.etree as ET

-from .util import working_directory
 from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
+from .util import working_directory

 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")

@ -161,7 +159,8 @@ def test_page_level():
    result = page_text(tree, textequiv_level="line")
    assert (
        result
-        == "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
+        == "Hand, Mylord? fragte der Graf von Rocheſter.\n"
+        + "Als er einsmals in dem Oberhauſe eine Bill we-"
    )


--- a/qurator/dinglehopper/tests/test_word_error_rate.py
+++ b/qurator/dinglehopper/tests/test_word_error_rate.py
@ -27,7 +27,8 @@ def test_words():
 def test_words_private_use_area():
    result = list(
        words(
-            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
+            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, "
+            "der Frau Amtmnnin das ver⸗\n"
            "ſproene zu berliefern."
        )
    )
--- a/qurator/dinglehopper/tests/util.py
+++ b/qurator/dinglehopper/tests/util.py
@ -1,8 +1,8 @@
+import os
 from itertools import zip_longest
 from typing import Iterable

 import colorama
-import os


 def diffprint(x, y):
--- a/Show more
+++ b/Show more
				`@ -1 +0,0 @@`
				`__import__("pkg_resources").declare_namespace(__name__)`