Merge branch 'master' into performance

2025-10-07 14:49:57 +02:00 · 2024-01-02 20:22:38 +01:00 · 2024-01-02 20:22:38 +01:00 · 38fcbc8e1c
commit 38fcbc8e1c
parent 68a12f8f7f f077ce2e1b
101 changed files with 58154 additions and 199 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -1,23 +0,0 @@
 version: 2.1
 jobs:
  test:
    parameters:
      python-version:
        type: string
    docker:
      - image: cimg/python:<< parameters.python-version >>
    steps:
      - checkout
      - run: pip3 install --upgrade pip
      - run: pip3 install -r requirements.txt
      - run: pip3 install pytest
      - run: pytest
 workflows:
  all-tests:
    jobs:
      - test:
          matrix:
            parameters:
              python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
--- a/.editorconfig
+++ b/.editorconfig
@ -15,7 +15,7 @@ indent_size = 2
 [*.json]
 indent_size = 2
-insert_final_newline = false
+insert_final_newline = true
 # trailing spaces in markdown indicate word wrap
 [*.md]
--- a/.github/workflows/release-check-version-tag
+++ b/.github/workflows/release-check-version-tag
@ -0,0 +1,14 @@
 #!/bin/bash
 # Release guard: fail unless the package version equals the checked-out git tag.
 #
 # The version is obtained by calling setuptools.setup() because the project may
 # declare a dynamic version field, which reading pyproject.toml alone would not
 # resolve.
 expected_git_tag="v$(python -c 'from setuptools import setup; setup()' --version)"
 actual_git_tag="$(git describe --tags)"
 # Bail out early on a mismatch; falling through means the tag is correct.
 if [[ "$expected_git_tag" != "$actual_git_tag" ]]; then
  echo "ERROR: Python package version $expected_git_tag does NOT match git tag $actual_git_tag"
  exit 1
 fi
 echo "OK: Python package version $expected_git_tag matches git tag"
 exit 0
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -0,0 +1,69 @@
 # Release workflow: triggered by pushing a version tag. It runs the test
 # workflow, builds the distribution once, and then publishes that same build
 # both as a GitHub release and to PyPI.
 name: release
 on:
  push:
    tags:
      - "v*.*.*"
 env:
  PYPI_URL: https://pypi.org/p/dinglehopper
 jobs:
  test:
    uses: ./.github/workflows/test.yml
  build:
    needs: test
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Upgrade pip
        run: python3 -m pip install --upgrade pip
      - name: Install setuptools
        run: |
          python3 -m pip install --upgrade setuptools
          # For OCR-D tools, we need setuptools-ocrd to get the version
          if [ -e ocrd-tool.json ]; then
            python3 -m pip install setuptools-ocrd
          fi
      - name: Check git tag vs package version
        run: .github/workflows/release-check-version-tag
      - name: Build package
        run: python3 -m pip install --upgrade build && python3 -m build
      - name: Upload dist
        # v3 of the artifact actions has been retired by GitHub; the upload and
        # the downloads below must all use v4 to be able to exchange artifacts.
        uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist/
  github-release:
    needs: build
    runs-on: ubuntu-latest
    permissions:
      contents: write  # required to create the release when the default token is read-only
    steps:
      - name: Download dist
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/
      - name: Create release on GitHub
        uses: softprops/action-gh-release@v1
        with:
          files: dist/*
  pypi-publish:
    needs: build
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: ${{ env.PYPI_URL }}
    permissions:
      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
    steps:
      - name: Download dist
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/
      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -0,0 +1,76 @@
 # Test workflow: runs pytest on every supported Python version and publishes
 # the JUnit results. It is also callable from other workflows (workflow_call).
 name: test
 on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master
  schedule:
    - cron: "00 16 07 * *"  # = monthly
  # Allow manually running (from GitHub Web)
  workflow_dispatch:
  # Allow calling this workflow (e.g. from release workflow)
  workflow_call:
 jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12" ]
    # For Python 3.6, we need to fall back to Ubuntu 20.04
    runs-on: ${{ matrix.python-version == '3.6' && 'ubuntu-20.04' || 'ubuntu-latest' }}
    env:
      # One results directory (and artifact name) per matrix entry
      test_results_dir: test-results-${{ matrix.python-version }}
    steps:
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Checkout
        uses: actions/checkout@v3
      - name: Update pip
        run: python3 -m pip install -U pip
      - name: Avoid compiling OpenCV and NumPy on Python 3.6
        run: |
          if python3 --version | grep -q "Python 3.6"; then
             pip install --prefer-binary -U opencv-python-headless numpy
          fi
      - name: Install requirements*.txt
        run: |
          for requirements_txt in requirements*.txt; do
            python3 -m pip install -r "$requirements_txt";
          done
      - name: Test
        run: |
            cd src
            mkdir -p ../$test_results_dir
            python3 -m pytest --junitxml=../$test_results_dir/junit.xml -o junit_family=legacy
      - name: Upload test results
        # v3 of the artifact actions has been retired by GitHub. v4 requires
        # unique artifact names per run, which the per-version name provides.
        uses: actions/upload-artifact@v4
        if: success() || failure()
        with:
          name: ${{ env.test_results_dir }}
          path: ${{ env.test_results_dir }}
      - name: Report tests
        uses: dorny/test-reporter@v1
        if: success() || failure()
        with:
          name: Results on Python ${{ matrix.python-version }}
          path: "${{env.test_results_dir }}/junit.xml"
          reporter: java-junit
--- a/.gitignore
+++ b/.gitignore
@ -16,6 +16,7 @@ htmlcov/
 .venv
 env/
 venv/
 .python-version
 # mypy
 .mypy_cache/
@ -27,3 +28,4 @@ dmypy.json
 # Build artifacts
 /build
 /dist
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,36 @@
 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
    -   id: trailing-whitespace
    -   id: end-of-file-fixer
    -   id: check-json
    -   id: check-toml
    -   id: check-yaml
    -   id: check-added-large-files
    -   id: check-ast
 -   repo: https://github.com/psf/black
    rev: 23.10.0
    hooks:
    -   id: black
 -   repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.1.1
    hooks:
    -   args:
        - --fix
        - --exit-non-zero-on-fix
        id: ruff
 -   repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.6.1
    hooks:
    -   additional_dependencies:
        - types-setuptools
        id: mypy
 -   repo: https://gitlab.com/vojko.pribudic/pre-commit-update
    rev: v0.1.0
    hooks:
    -   id: pre-commit-update
--- a/README-DEV.md
+++ b/README-DEV.md
@ -1,6 +1,6 @@
 Testing
 =======
-Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests):
+Use `pytest` to run the tests in [the tests directory](dinglehopper/tests):
 ```bash
 virtualenv -p /usr/bin/python3 venv
 . venv/bin/activate
@ -10,6 +10,7 @@ pytest
 ```
 ## Test running examples
 Only unit tests:
 ```bash
 pytest -m "not integration"
@ -27,11 +28,18 @@ pytest
 All tests with code coverage:
 ```bash
-pytest --cov=qurator --cov-report=html
+pytest --cov=dinglehopper --cov-report=html
 ```
 Static code analysis:
 ```bash
 pytest -k "not test" --flake8
 pytest -k "not test" --mypy
 pytest -k "not test" --ruff
 ```
 # How to use pre-commit
 This project optionally uses [pre-commit](https://pre-commit.com) to check commits. To use it:
 - Install pre-commit, e.g. `pip install -r requirements-dev.txt`
 - Install the repo-local git hooks: `pre-commit install`
--- a/README.md
+++ b/README.md
@ -5,9 +5,13 @@ dinglehopper is an OCR evaluation tool and reads
 [ALTO](https://github.com/altoxml),
 [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files.  It
 compares a ground truth (GT) document page with an OCR result page to compute
-metrics and a word/character differences report.
+metrics and a word/character differences report. It also supports batch processing by
 generating, aggregating and summarizing multiple reports.
-[![Build Status](https://circleci.com/gh/qurator-spk/dinglehopper.svg?style=svg)](https://circleci.com/gh/qurator-spk/dinglehopper)
+[![Tests](https://github.com/qurator-spk/dinglehopper/workflows/test/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
 [![GitHub tag](https://img.shields.io/github/tag/qurator-spk/dinglehopper?include_prereleases=&sort=semver&color=blue)](https://github.com/qurator-spk/dinglehopper/releases/)
 [![License](https://img.shields.io/badge/License-Apache-blue)](#license)
 [![issues - dinglehopper](https://img.shields.io/github/issues/qurator-spk/dinglehopper)](https://github.com/qurator-spk/dinglehopper/issues)
 Goals
 -----
@ -19,15 +23,16 @@ Goals
 Installation
 ------------
-It's best to use pip, e.g.:
+
-~~~
+It's best to use pip to install the package from PyPI, e.g.:
-sudo pip install .
+```
-~~~
+pip install dinglehopper
 ```
 Usage
 -----
 ~~~
-Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
+Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] [REPORTS_FOLDER]
  Compare the PAGE/ALTO/text document GT against the document OCR.
@ -35,19 +40,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
  their text and falls back to plain text if no ALTO or PAGE is detected.
  The files GT and OCR are usually a ground truth document and the result of
-  an OCR software, but you may use dinglehopper to compare two OCR results.
+  an OCR software, but you may use dinglehopper to compare two OCR results. In
-  In that case, use --no-metrics to disable the then meaningless metrics and
+  that case, use --no-metrics to disable the then meaningless metrics and also
-  also change the color scheme from green/red to blue.
+  change the color scheme from green/red to blue.
-  The comparison report will be written to $REPORT_PREFIX.{html,json}, where
+  The comparison report will be written to
-  $REPORT_PREFIX defaults to "report". The reports include the character
+  $REPORTS_FOLDER/$REPORT_PREFIX.{html,json}, where $REPORTS_FOLDER defaults
-  error rate (CER) and the word error rate (WER).
+  to the current working directory and $REPORT_PREFIX defaults to "report".
  The reports include the character error rate (CER) and the word error rate
  (WER).
  By default, the text of PAGE files is extracted on 'region' level. You may
  use "--textequiv-level line" to extract from the level of TextLine tags.
 Options:
  --metrics / --no-metrics  Enable/disable metrics and green/red
  --differences BOOLEAN     Enable reporting character and word level
                            differences
  --textequiv-level LEVEL   PAGE TextEquiv level to extract text from
  --progress                Show progress bar
  --help                    Show this message and exit.
@ -61,6 +70,43 @@ This generates `report.html` and `report.json`.
 ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true)
 Batch comparison between folders of GT and OCR files can be done by simply providing
 folders:
 ~~~
 dinglehopper gt/ ocr/ report output_folder/
 ~~~
 This assumes that you have files with the same name in both folders, e.g.
 `gt/00000001.page.xml` and `ocr/00000001.alto.xml`.
 The example generates reports for each set of files, with the prefix `report`, in the
 (automatically created) folder `output_folder/`.
 By default, the JSON report does not contain the character and word differences, only
 the calculated metrics. If you want to include the differences, use the
 `--differences` flag:
 ~~~
 dinglehopper gt/ ocr/ report output_folder/ --differences
 ~~~
 ### dinglehopper-summarize
 A set of (JSON) reports can be summarized into a single set of
 reports. This is useful after having generated reports in batch.
 Example:
 ~~~
 dinglehopper-summarize output_folder/
 ~~~
 This generates `summary.html` and `summary.json` in the same `output_folder`.
 If you are summarizing many reports and have used the `--differences` flag while
 generating them, it may be useful to limit the number of differences reported by using
 the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
 report, making it easier to open and navigate. Note that the JSON report will still
 contain all differences. Example:
 ~~~
 dinglehopper-summarize output_folder/ --occurrences-threshold 10
 ~~~
 ### dinglehopper-line-dirs
 You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.txt`)
 with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate
--- a/ocrd-tool.json
+++ b/ocrd-tool.json
@ -1 +1 @@
-qurator/dinglehopper/ocrd-tool.json
+src/dinglehopper/ocrd-tool.json
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,70 @@
 [build-system]
 requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"]
 [project]
 name = "dinglehopper"
 authors = [
    {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
    {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
 ]
 description = "The OCR evaluation tool"
 readme = "README.md"
 requires-python = ">=3.6"
 keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
 dynamic = ["version", "dependencies", "optional-dependencies"]
 # https://pypi.org/classifiers/
 classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Environment :: Console",
    "Intended Audience :: Science/Research",
    "Intended Audience :: Other Audience",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
    "Topic :: Scientific/Engineering :: Information Analysis",
    "Topic :: Text Processing",
 ]
 [project.scripts]
 dinglehopper = "dinglehopper.cli:main"
 dinglehopper-line-dirs = "dinglehopper.cli_line_dirs:main"
 dinglehopper-extract = "dinglehopper.cli_extract:main"
 dinglehopper-summarize = "dinglehopper.cli_summarize:main"
 ocrd-dinglehopper = "dinglehopper.ocrd_cli:ocrd_dinglehopper"
 [project.urls]
 Homepage = "https://github.com/qurator-spk/dinglehopper"
 Repository = "https://github.com/qurator-spk/dinglehopper.git"
 [tool.setuptools.dynamic]
 dependencies = {file = ["requirements.txt"]}
 optional-dependencies.dev = {file = ["requirements-dev.txt"]}
 [tool.setuptools.packages.find]
 where = ["src"]
 [tool.setuptools.package-data]
 dinglehopper = ["templates/*"]
 [tool.pytest.ini_options]
 minversion = 6.0
 addopts = "--strict-markers"
 markers = [
    "integration: integration tests",
 ]
 [tool.mypy]
 ignore_missing_imports = true
 [tool.ruff]
 select = ["E", "F", "I"]
 ignore = [
    "F811",  # multimethods are considered redefinitions by ruff
 ]
--- a/pytest.ini
+++ b/pytest.ini
@ -1,4 +0,0 @@
 [pytest]
 markers =
    integration: integration tests
    serial
--- a/qurator/init.py
+++ b/qurator/init.py
@ -1 +0,0 @@
 __import__("pkg_resources").declare_namespace(__name__)
--- a/qurator/dinglehopper/init.py
+++ b/qurator/dinglehopper/init.py
@ -1,5 +0,0 @@
 from .ocr_files import *
 from .extracted_text import *
 from .character_error_rate import *
 from .word_error_rate import *
 from .align import *
--- a/qurator/dinglehopper/templates/report.html.js
+++ b/qurator/dinglehopper/templates/report.html.js
@ -1,15 +0,0 @@
 function find_diff_class(classes) {
    return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
 }
 $(document).ready(function() {
    /* Enable Bootstrap tooltips */
    $('[data-toggle="tooltip"]').tooltip();
    $('.diff').mouseover(function() {
        find_diff_class($(this).attr('class')).addClass('diff-highlight');
    });
    $('.diff').mouseout(function() {
        find_diff_class($(this).attr('class')).removeClass('diff-highlight');
    });
 });
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -1,5 +1,8 @@
 pytest
 pytest-flake8
 pytest-cov
 pytest-mypy
 black
 pre-commit
 ruff ; python_version >= "3.7"
 pytest-ruff ; python_version >= "3.7"
--- a/requirements.txt
+++ b/requirements.txt
@ -10,4 +10,4 @@ attrs
 multimethod >= 1.3
 tqdm
 rapidfuzz >= 2.7.0
-six  # XXX workaround OCR-D/core#730
+chardet
--- a/setup.cfg
+++ b/setup.cfg
@ -1,12 +0,0 @@
 [flake8]
 max-line-length = 88
 extend-ignore = E203, W503
 [pylint]
 max-line-length = 88
 [pylint.messages_control]
 disable = C0330, C0326
 [mypy]
 ignore_missing_imports = True
--- a/setup.py
+++ b/setup.py
@ -1,34 +0,0 @@
 from io import open
 from setuptools import find_packages, setup
 with open("requirements.txt") as fp:
    install_requires = fp.read()
 with open('requirements-dev.txt') as fp:
    tests_require = fp.read()
 setup(
    name="dinglehopper",
    author="Mike Gerber, The QURATOR SPK Team",
    author_email="mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de",
    description="The OCR evaluation tool",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
    keywords="qurator ocr",
    license="Apache",
    namespace_packages=["qurator"],
    packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
    install_requires=install_requires,
    tests_require=tests_require,
    package_data={
        "": ["*.json", "templates/*"],
    },
    entry_points={
        "console_scripts": [
            "dinglehopper=qurator.dinglehopper.cli:main",
            "dinglehopper-line-dirs=qurator.dinglehopper.cli_line_dirs:main",
            "dinglehopper-extract=qurator.dinglehopper.cli_extract:main",
            "ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper",
        ]
    },
 )
--- a/src/dinglehopper/init.py
+++ b/src/dinglehopper/init.py
@ -0,0 +1,33 @@
 from .align import align, score_hint, seq_align
 from .character_error_rate import character_error_rate, character_error_rate_n
 from .edit_distance import distance, editops
 from .extracted_text import ExtractedText
 from .ocr_files import (
    alto_namespace,
    alto_text,
    page_namespace,
    page_text,
    plain_text,
    text,
 )
 from .word_error_rate import word_error_rate, word_error_rate_n, words
 __all__ = [
    "editops",
    "distance",
    "align",
    "score_hint",
    "seq_align",
    "character_error_rate",
    "character_error_rate_n",
    "word_error_rate",
    "word_error_rate_n",
    "words",
    "ExtractedText",
    "alto_namespace",
    "alto_text",
    "page_namespace",
    "page_text",
    "plain_text",
    "text",
 ]
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@ -1,9 +1,12 @@
 import math
 import unicodedata
 from math import ceil
 from .edit_distance import *
 from rapidfuzz.distance import Levenshtein
 from .edit_distance import grapheme_clusters
 def align(t1, t2):
    """Align text."""
    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@ -1,20 +1,22 @@
 import os
 from collections import Counter
 import click
 from jinja2 import Environment, FileSystemLoader
 from markupsafe import escape
 from ocrd_utils import initLogging
 from math import ceil
-from .character_error_rate import character_error_rate_n
+from dinglehopper.align import score_hint, seq_align
-from .word_error_rate import word_error_rate_n, words_normalized
+from dinglehopper.character_error_rate import character_error_rate_n
-from .align import seq_align, score_hint
+from dinglehopper.config import Config
-from .extracted_text import ExtractedText
+from dinglehopper.extracted_text import ExtractedText
-from .ocr_files import extract
+from dinglehopper.ocr_files import extract
-from .config import Config
+from dinglehopper.word_error_rate import word_error_rate_n, words_normalized
-def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
+def gen_diff_report(
    gt_in, ocr_in, css_prefix, joiner, none, *, differences=False, score_hint=None
 ):
    gtx = ""
    ocrx = ""
@ -31,16 +33,12 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
        # Set Bootstrap tooltip to the segment id
        if id_:
-            html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
+            html_custom_attrs += f'data-toggle="tooltip" title="{id_}"'
        if css_classes:
-            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
+            return f'<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'
                css_classes=css_classes,
                html_t=html_t,
                html_custom_attrs=html_custom_attrs,
            )
        else:
-            return "{html_t}".format(html_t=html_t)
+            return f"{html_t}"
    if isinstance(gt_in, ExtractedText):
        if not isinstance(ocr_in, ExtractedText):
@ -53,6 +51,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
    g_pos = 0
    o_pos = 0
    found_differences = []
    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, score_hint)):
        css_classes = None
        gt_id = None
@ -65,6 +65,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
                # Deletions and inserts only produce one id + None, UI must
                # support this, i.e. display for the one id produced
            if differences:
                found_differences.append(f"{g} :: {o}")
        gtx += joiner + format_thing(g, css_classes, gt_id)
        ocrx += joiner + format_thing(o, css_classes, ocr_id)
@ -73,13 +76,18 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
        if o is not None:
            o_pos += len(o)
-    return """
+    found_differences = dict(Counter(elem for elem in found_differences))
    return (
        """
        <div class="row">
           <div class="col-md-6 gt">{}</div>
           <div class="col-md-6 ocr">{}</div>
        </div>
        """.format(
            gtx, ocrx
        ),
        found_differences,
    )
@ -96,11 +104,20 @@ def json_float(value):
        return str(value)
-def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
+def process(
    gt,
    ocr,
    report_prefix,
    reports_folder=".",
    *,
    metrics=True,
    differences=False,
    textequiv_level="region",
 ):
    """Check OCR result against GT.
-    The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
+    The @click decorators change the signature of the decorated functions, so we keep
-    Click on a wrapper.
+    this undecorated version and use Click on a wrapper.
    """
    gt_text = extract(gt, textequiv_level=textequiv_level)
@ -109,15 +126,25 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
    ocr_words = words_normalized(ocr_text)
    cer, n_characters = character_error_rate_n(gt_text, ocr_text)
-    char_diff_report = gen_diff_report(
+    char_diff_report, diff_c = gen_diff_report(
-        gt_text, ocr_text, css_prefix="c", joiner="", none="·",
+        gt_text,
-        score_hint=score_hint(cer, n_characters)
+        ocr_text,
        css_prefix="c",
        joiner="",
        none="·",
        score_hint=score_hint(cer, n_characters),
        differences=differences,
    )
    wer, n_words = word_error_rate_n(gt_words, ocr_words)
-    word_diff_report = gen_diff_report(
+    word_diff_report, diff_w = gen_diff_report(
-        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯",
+        gt_words,
-        score_hint=score_hint(wer, n_words)
+        ocr_words,
        css_prefix="w",
        joiner=" ",
        none="⋯",
        score_hint=score_hint(wer, n_words),
        differences=differences,
    )
    env = Environment(
@ -129,7 +156,11 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
    for report_suffix in (".html", ".json"):
        template_fn = "report" + report_suffix + ".j2"
-        out_fn = report_prefix + report_suffix
+
        if not os.path.isdir(reports_folder):
            os.mkdir(reports_folder)
        out_fn = os.path.join(reports_folder, report_prefix + report_suffix)
        template = env.get_template(template_fn)
        template.stream(
@ -142,16 +173,46 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
            char_diff_report=char_diff_report,
            word_diff_report=word_diff_report,
            metrics=metrics,
            differences=differences,
            diff_c=diff_c,
            diff_w=diff_w,
        ).dump(out_fn)
 def process_dir(
    gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level
 ):
    """Run process() for every file name present in both the gt and ocr folders.

    Each entry of the gt folder is paired with the same-named entry of the ocr
    folder. Pairs where either side is not a regular file are reported on stdout
    and skipped. The report prefix of each pair is "<file name>-<report_prefix>".
    """
    for entry in os.listdir(gt):
        gt_path = os.path.join(gt, entry)
        ocr_path = os.path.join(ocr, entry)
        both_are_files = os.path.isfile(gt_path) and os.path.isfile(ocr_path)
        if not both_are_files:
            print("Skipping {0} and {1}".format(gt_path, ocr_path))
            continue
        process(
            gt_path,
            ocr_path,
            f"{entry}-{report_prefix}",
            reports_folder=reports_folder,
            metrics=metrics,
            differences=differences,
            textequiv_level=textequiv_level,
        )
@click.command()
@click.argument("gt", type=click.Path(exists=True))
@click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report")
@click.argument("reports_folder", type=click.Path(), default=".")
@click.option(
    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
 )
@click.option(
    "--differences",
    default=False,
    help="Enable reporting character and word level differences",
 )
@click.option(
    "--textequiv-level",
    default="region",
@ -159,7 +220,16 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
    metavar="LEVEL",
 )
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
-def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
+def main(
    gt,
    ocr,
    report_prefix,
    reports_folder,
    metrics,
    differences,
    textequiv_level,
    progress,
 ):
    """
    Compare the PAGE/ALTO/text document GT against the document OCR.
@ -171,7 +241,8 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
    that case, use --no-metrics to disable the then meaningless metrics and also
    change the color scheme from green/red to blue.
-    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
+    The comparison report will be written to $REPORTS_FOLDER/$REPORT_PREFIX.{html,json},
    where $REPORTS_FOLDER defaults to the current working directory and
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).
@ -180,7 +251,31 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
    """
    initLogging()
    Config.progress = progress
-    process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
+    if os.path.isdir(gt):
        if not os.path.isdir(ocr):
            raise click.BadParameter(
                "OCR must be a directory if GT is a directory", param_hint="ocr"
            )
        else:
            process_dir(
                gt,
                ocr,
                report_prefix,
                reports_folder,
                metrics,
                differences,
                textequiv_level,
            )
    else:
        process(
            gt,
            ocr,
            report_prefix,
            reports_folder,
            metrics=metrics,
            differences=differences,
            textequiv_level=textequiv_level,
        )
 if __name__ == "__main__":
--- a/qurator/dinglehopper/cli_extract.py
+++ b/qurator/dinglehopper/cli_extract.py
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@ -1,15 +1,15 @@
 import os
 import itertools
 import os
 import click
 from jinja2 import Environment, FileSystemLoader
 from ocrd_utils import initLogging
 from math import ceil
 from .align import score_hint
 from .character_error_rate import character_error_rate_n
 from .word_error_rate import word_error_rate_n, words_normalized
 from .ocr_files import plain_extract
 from .cli import gen_diff_report, json_float
 from .ocr_files import plain_extract
 from .word_error_rate import word_error_rate_n, words_normalized
 def all_equal(iterable):
@ -75,12 +75,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
        # Generate diff reports
        char_diff_report += gen_diff_report(
-            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·",
+            gt_text,
-            score_hint=score_hint(l_cer, l_n_characters)
+            ocr_text,
            css_prefix="l{0}-c".format(k),
            joiner="",
            none="·",
            score_hint=score_hint(l_cer, l_n_characters),
        )
        word_diff_report += gen_diff_report(
-            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯",
+            gt_words,
-            score_hint=score_hint(l_wer, l_n_words)
+            ocr_words,
            css_prefix="l{0}-w".format(k),
            joiner=" ",
            none="⋯",
            score_hint=score_hint(l_wer, l_n_words),
        )
    env = Environment(
--- a/src/dinglehopper/cli_summarize.py
+++ b/src/dinglehopper/cli_summarize.py
@ -0,0 +1,106 @@
 import json
 import os
 import click
 from jinja2 import Environment, FileSystemLoader
 from ocrd_utils import initLogging
 from dinglehopper.cli import json_float
 def process(reports_folder, occurrences_threshold=1):
    """Summarize the JSON reports found in reports_folder.

    Every "*.json" file in the folder that contains both a "cer" and a "wer" key
    is counted; other JSON files are skipped with a message. The character and
    word level difference counts of the reports (if present) are added up.
    The averages are printed and "summary.html" and "summary.json" are rendered
    from the "summary*.j2" templates into reports_folder.

    :param reports_folder: folder containing the JSON reports; the summary files
        are written here as well
    :param occurrences_threshold: passed through to the templates unchanged
    :return: None; returns early (writing nothing) if no usable report is found
    """
    cer_list = []
    wer_list = []
    cer_sum = 0
    wer_sum = 0
    diff_c = {}
    diff_w = {}
    for report in os.listdir(reports_folder):
        if report.endswith(".json"):
            # Read explicitly as UTF-8 instead of the locale's default encoding:
            # the reports may contain non-ASCII text in their differences.
            report_path = os.path.join(reports_folder, report)
            with open(report_path, "r", encoding="utf-8") as f:
                report_data = json.load(f)
                if "cer" not in report_data or "wer" not in report_data:
                    click.echo(
                        f"Skipping {report} because it does not contain CER and WER"
                    )
                    continue
                cer = report_data["cer"]
                wer = report_data["wer"]
                cer_list.append(cer)
                wer_list.append(wer)
                cer_sum += cer
                wer_sum += wer
                # The "differences" section is optional in a report, so a
                # missing key is not an error.
                try:
                    for key, value in report_data["differences"][
                        "character_level"
                    ].items():
                        diff_c[key] = diff_c.get(key, 0) + value
                    for key, value in report_data["differences"]["word_level"].items():
                        diff_w[key] = diff_w.get(key, 0) + value
                except KeyError:
                    pass
    if len(cer_list) == 0:
        click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
        return
    cer_avg = cer_sum / len(cer_list)
    wer_avg = wer_sum / len(wer_list)
    print(f"Number of reports: {len(cer_list)}")
    print(f"Average CER: {cer_avg}")
    print(f"Average WER: {wer_avg}")
    # These are the summed error rates, labelled as such so that the two lines
    # can be told apart.
    print(f"Sum of CER: {cer_sum}")
    print(f"Sum of WER: {wer_sum}")
    env = Environment(
        loader=FileSystemLoader(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
        )
    )
    env.filters["json_float"] = json_float
    for report_suffix in (".html", ".json"):
        template_fn = "summary" + report_suffix + ".j2"
        out_fn = os.path.join(reports_folder, "summary" + report_suffix)
        template = env.get_template(template_fn)
        template.stream(
            num_reports=len(cer_list),
            cer_avg=cer_avg,
            wer_avg=wer_avg,
            diff_c=diff_c,
            diff_w=diff_w,
            occurrences_threshold=occurrences_threshold,
        ).dump(out_fn)
@click.command()
@click.argument("reports_folder", type=click.Path(exists=True), default="./reports")
@click.option(
    "--occurrences-threshold",
    type=int,
    default=1,
    help="Only show differences that occur at least this many times.",
)
def main(reports_folder, occurrences_threshold):
    """
    Summarize the results from multiple reports generated earlier by dinglehopper.
    It calculates the average CER and WER, as well as a sum of common mistakes.
    Reports include lists of mistakes and their occurrences.
    You may use a threshold to reduce the file size of the HTML report by only showing
    mistakes that occur at least as many times as the threshold. The JSON report will
    always contain all mistakes.
    All JSON files in the provided folder will be gathered and summarized.
    """
    # NOTE: the docstring above doubles as the command's --help text, so it must
    # agree with the --occurrences-threshold option help ("at least this many").
    # Set up logging first, then delegate all work to process().
    initLogging()
    process(reports_folder, occurrences_threshold)
 if __name__ == "__main__":
    main()
--- a/qurator/dinglehopper/config.py
+++ b/qurator/dinglehopper/config.py
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@ -1,8 +1,8 @@
 import unicodedata
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
 from rapidfuzz.distance import Levenshtein
 from uniseg.graphemecluster import grapheme_clusters
 from .extracted_text import ExtractedText
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
--- a/qurator/dinglehopper/notebooks/Levenshtein.ipynb
+++ b/qurator/dinglehopper/notebooks/Levenshtein.ipynb
--- a/qurator/dinglehopper/notebooks/Unicode
+++ b/qurator/dinglehopper/notebooks/Unicode
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@ -2,6 +2,7 @@ import os
 import sys
 from typing import Iterator
 import chardet
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
 from uniseg.graphemecluster import grapheme_clusters
@ -12,8 +13,8 @@ from .extracted_text import ExtractedText, normalize_sbb
 def alto_namespace(tree: ET.ElementTree) -> str:
    """Return the ALTO namespace used in the given ElementTree.
-    This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not
+    This relies on the assumption that, in any given ALTO file, the root element has the
-    check if the files uses any valid ALTO namespace.
+    local name "alto". We do not check if the file uses any valid ALTO namespace.
    """
    root_name = ET.QName(tree.getroot().tag)
    if root_name.localname == "alto":
@ -48,8 +49,9 @@ def alto_text(tree):
 def page_namespace(tree):
    """Return the PAGE content namespace used in the given ElementTree.
-    This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We
+    This relies on the assumption that, in any given PAGE content file, the root element
-    do not check if the files uses any valid PAGE namespace.
+    has the local name "PcGts". We do not check if the file uses any valid PAGE
    namespace.
    """
    root_name = ET.QName(tree.getroot().tag)
    if root_name.localname == "PcGts":
@ -135,6 +137,10 @@ def page_text(tree, *, textequiv_level="region"):
    return page_extract(tree, textequiv_level=textequiv_level).text
 def detect_encoding(filename):
    return chardet.detect(open(filename, "rb").read(1024))["encoding"]
 def plain_extract(filename, include_filename_in_id=False):
    id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
@ -149,7 +155,8 @@ def plain_extract(filename, include_filename_in_id=False):
            clusters,
        )
-    with open(filename, "r") as f:
+    fileencoding = detect_encoding(filename)
    with open(filename, "r", encoding=fileencoding) as f:
        return ExtractedText(
            None,
            [make_segment(no, line) for no, line in enumerate(f.readlines())],
@ -171,7 +178,7 @@ def extract(filename, *, textequiv_level="region"):
    """
    try:
        tree = ET.parse(filename)
-    except XMLSyntaxError:
+    except (XMLSyntaxError, UnicodeDecodeError):
        return plain_extract(filename)
    try:
        return page_extract(tree, textequiv_level=textequiv_level)
--- a/qurator/dinglehopper/ocrd-tool.json
+++ b/qurator/dinglehopper/ocrd-tool.json
@ -1,4 +1,5 @@
 {
  "version": "0.9.4",
  "git_url": "https://github.com/qurator-spk/dinglehopper",
  "tools": {
    "ocrd-dinglehopper": {
--- a/qurator/dinglehopper/ocrd_cli.py
+++ b/qurator/dinglehopper/ocrd_cli.py
@ -4,7 +4,7 @@ import os
 import click
 from ocrd import Processor
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
-from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
+from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
 from pkg_resources import resource_string
 from .cli import process as cli_process
--- a/qurator/dinglehopper/templates/report.html.j2
+++ b/qurator/dinglehopper/templates/report.html.j2
@ -26,6 +26,22 @@
      border: 2px solid;
      border-radius: 5px;
    }
    .row {
        margin-bottom: 20px;
    }
    table {
        width: 100%;
    }
    th {
        cursor: pointer;
    }
    th:hover {
        background-color: #eee;
    }
    </style>
 </head>
 <body>
@ -50,6 +66,32 @@
 <h2>Word differences</h2>
 {{ word_diff_report }}
 {%- if differences %}
 {# One table per level; diff_c/diff_w map "GT :: OCR" keys to occurrence counts. #}
 {% set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
 <div class="row">
 {% for section in sections %}
    <div class="col-md-6">
        <h2>{{ section['title'] }}</h2>
        <table>
            <thead>
            <tr>
                <th>GT</th>
                <th>OCR</th>
                <th>Occurrences</th>
            </tr>
            </thead>
            {# Data rows go into an explicit tbody: the column sort handler selects 'tbody > tr'. #}
            <tbody>
            {% for gt_ocr, occurrences in section['data'].items() %}
                {# Split on the full " :: " separator so the cells carry no stray padding. #}
                {%- set gt = gt_ocr.split(" :: ")[0] %}
                {%- set ocr = gt_ocr.split(" :: ")[1] %}
                <tr>
                    <td>{{ gt }}</td>
                    <td>{{ ocr }}</td>
                    <td>{{ occurrences }}</td>
                </tr>
            {% endfor %}
            </tbody>
        </table>
    </div>
 {% endfor %}
 </div>
 {%- endif %}
 </div>
--- a/src/dinglehopper/templates/report.html.js
+++ b/src/dinglehopper/templates/report.html.js
@ -0,0 +1,39 @@
 /*
  * Given the value of an element's class attribute, return the jQuery set of
  * all elements that carry the first class matching /.diff\d.*\/-style names
  * (any character, then "diff", a digit, and anything after).
  * NOTE(review): if no class matches, find() yields undefined and the selector
  * becomes '.undefined', which normally selects nothing.
  */
 function find_diff_class(classes) {
    return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
 }
 $(document).ready(function() {
    /* Enable Bootstrap tooltips */
    $('[data-toggle="tooltip"]').tooltip();
    /* Highlight every element sharing the hovered element's diff class ... */
    $('.diff').mouseover(function() {
        find_diff_class($(this).attr('class')).addClass('diff-highlight');
    });
    /* ... and remove the highlight again when the pointer leaves. */
    $('.diff').mouseout(function() {
        find_diff_class($(this).attr('class')).removeClass('diff-highlight');
    });
    /* Sort this column of the table */
    $('th').click(function () {
        var table = $(this).closest('table');
        /* Only rows that are direct children of a tbody take part in sorting. */
        var rows = table.find('tbody > tr').toArray().sort(compareRows($(this).index()));
        /* Sort direction is kept on the th element; the first click sorts ascending. */
        this.asc = !this.asc;
        if (!this.asc) {
            rows = rows.reverse();
        }
        /* Re-appending an existing row moves it, so this reorders the tbody in place. */
        for (var i = 0; i < rows.length; i++) {
            table.children('tbody').append(rows[i]);
        }
    });
    /*
     * Build a comparator that orders two rows by the lower-cased text of the
     * cell at the given column index; numeric: true makes digit runs compare
     * as numbers.
     */
    function compareRows(index) {
        return function (row1, row2) {
            var cell1 = $(row1).children('td').eq(index).text().toLowerCase();
            var cell2 = $(row2).children('td').eq(index).text().toLowerCase();
            return cell1.localeCompare(cell2, undefined, {
                numeric: true,
                sensitivity: 'base'
            });
        }
    }
 });
--- a/qurator/dinglehopper/templates/report.json.j2
+++ b/qurator/dinglehopper/templates/report.json.j2
@ -4,6 +4,12 @@
 {% if metrics %}
    "cer": {{ cer|json_float }},
    "wer": {{ wer|json_float }},
 {% endif %}
 {% if differences %}
    "differences": {
        "character_level": {{ diff_c|tojson }},
        "word_level": {{ diff_w|tojson }}
    },
 {% endif %}
    "n_characters": {{ n_characters }},
    "n_words": {{ n_words }}
--- a/src/dinglehopper/templates/summary.html.j2
+++ b/src/dinglehopper/templates/summary.html.j2
@ -0,0 +1,136 @@
 <!doctype html>
 <html lang="en">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
    <style type="text/css">
    {% if metrics %}
    .gt .diff {
        color: green;
    }
    .ocr .diff {
        color: red;
    }
    {% else %}
    .gt .diff, .ocr .diff {
        color: blue;
    }
    {% endif %}
    .ellipsis {
        opacity: 0.5;
        font-style: italic;
    }
    .diff-highlight {
      border: 2px solid;
      border-radius: 5px;
    }
    .row {
        margin-bottom: 20px;
    }
    table {
        width: 100%;
    }
    .cer {
        flex-direction: column;
    }
    tr:hover {
        background-color: #f5f5f5;
    }
    th {
        cursor: pointer;
    }
    th:hover {
        background-color: #eee;
    }
    td {
        min-width: 100px;
    }
    td:hover {
        background-color: #eee;
    }
    </style>
 </head>
 <body>
 <div class="container">
 <div class="row">
    <h1>Summary of all reports</h1>
 </div>
 <div class="row">
    <p>Number of reports: {{ num_reports }}</p>
 </div>
 {% if cer_avg and wer_avg -%}
 <div class="row">
    <h2>Metrics</h2>
 </div>
 <div class="row cer">
    <p>Average CER: {{ cer_avg|round(4) }}</p>
    <p>Average WER: {{ wer_avg|round(4) }}</p>
 </div>
 {% endif %}
 {%- if diff_c and diff_w %}
 {%- set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
 <div class="row">
 {%- for section in sections %}
    <div class="col-md-6">
        <h2>{{ section['title'] }}</h2>
        <table>
            <thead>
            <tr><th>GT</th><th>OCR</th><th>Occurrences</th></tr>
            </thead>
            {%- set num_omitted = namespace(value=0) -%}
            {% for gt_ocr, occurrences in section['data'].items() -%}
                {% if occurrences < occurrences_threshold -%}
                    {%- set num_omitted.value = num_omitted.value + 1 %}
                {%- else -%}
                    {%- set gt = gt_ocr.split(" :: ")[0] %}
                    {%- set ocr = gt_ocr.split(" :: ")[1] %}
                    <tr>
                        <td title="{{ gt|urlencode }}">{{ gt }}</td>{# display the unicode character #}
                        <td title="{{ ocr|urlencode }}">{{ ocr }}</td >
                        <td>{{ occurrences }}</td>
                    </tr>
                {%- endif %}
            {%- endfor %}
            {% if num_omitted.value > 0  and occurrences_threshold > 1 -%}
                <p>Skipped {{ num_omitted.value }} diffs with fewer than {{ occurrences_threshold }} occurrences. The complete list of diffs is available in the accompanying JSON file.</p>
                {%- set num_omitted.value = 0 %}
            {%- endif %}
        </table>
    </div>
 {%- endfor %}
 </div>
 {%- endif %}
 </div>
 <script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
 <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
 <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
 <script>
 {% include 'report.html.js' %}
 </script>
 </body>
 </html>
--- a/src/dinglehopper/templates/summary.json.j2
+++ b/src/dinglehopper/templates/summary.json.j2
@ -0,0 +1,15 @@
 {
 "num_reports": {{ num_reports}}
 {%- if cer_avg and wer_avg %}
    ,
    "cer_avg": {{ cer_avg|json_float }},
    "wer_avg": {{ wer_avg|json_float }}
 {%- endif %}
 {#- The differences do not depend on the metrics: emit them whenever any were collected at either level. -#}
 {%- if diff_c or diff_w %}
    ,
    "differences": {
        "character_level": {{ diff_c|tojson }},
        "word_level": {{ diff_w|tojson }}
    }
 {%- endif %}
 }
--- a/qurator/dinglehopper/tests/init.py
+++ b/qurator/dinglehopper/tests/init.py
--- a/qurator/dinglehopper/tests/data/00000119.tif
+++ b/qurator/dinglehopper/tests/data/00000119.tif
--- a/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-GT-PAGE/00000024.page.xml
+++ b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-GT-PAGE/00000024.page.xml
--- a/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml
+++ b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml
--- a/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml
+++ b/qurator/dinglehopper/tests/data/actevedef_718448162/OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml
--- a/qurator/dinglehopper/tests/data/actevedef_718448162/mets.xml
+++ b/qurator/dinglehopper/tests/data/actevedef_718448162/mets.xml
--- a/src/dinglehopper/tests/data/bigger-texts/00008228/00008228-00236534.gt4hist.xml
+++ b/src/dinglehopper/tests/data/bigger-texts/00008228/00008228-00236534.gt4hist.xml
--- a/src/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
+++ b/src/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
--- a/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml
+++ b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.gt.page.xml
--- a/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml
+++ b/qurator/dinglehopper/tests/data/brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml
--- a/qurator/dinglehopper/tests/data/test-gt.page2018.xml
+++ b/qurator/dinglehopper/tests/data/test-gt.page2018.xml
--- a/src/dinglehopper/tests/data/directory-test/gt/2.xml
+++ b/src/dinglehopper/tests/data/directory-test/gt/2.xml
--- a/qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml
+++ b/qurator/dinglehopper/tests/data/test-fake-ocr.page2018.xml
--- a/src/dinglehopper/tests/data/directory-test/ocr/2.xml
+++ b/src/dinglehopper/tests/data/directory-test/ocr/2.xml
--- a/src/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
+++ b/src/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
--- a/qurator/dinglehopper/tests/data/levels-are-different.page.xml
+++ b/qurator/dinglehopper/tests/data/levels-are-different.page.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.pdf
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan-bad.tif
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.gt.page.xml
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.gt.page.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.pdf
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum-scan.tif
--- a/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt
+++ b/qurator/dinglehopper/tests/data/lorem-ipsum/lorem-ipsum.odt
--- a/qurator/dinglehopper/tests/data/mixed-regions.page.xml
+++ b/qurator/dinglehopper/tests/data/mixed-regions.page.xml
--- a/qurator/dinglehopper/tests/data/order.page.xml
+++ b/qurator/dinglehopper/tests/data/order.page.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-no-reading-order.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-no-reading-order.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-order-0001.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-order-0001.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-order-0002.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-order-0002.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-region.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-region.xml
--- a/qurator/dinglehopper/tests/data/table-order/table-unordered.xml
+++ b/qurator/dinglehopper/tests/data/table-order/table-unordered.xml
--- a/src/dinglehopper/tests/data/test-fake-ocr.page2018.xml
+++ b/src/dinglehopper/tests/data/test-fake-ocr.page2018.xml
--- a/src/dinglehopper/tests/data/test-gt.page2018.xml
+++ b/src/dinglehopper/tests/data/test-gt.page2018.xml
--- a/qurator/dinglehopper/tests/data/test.alto1.xml
+++ b/qurator/dinglehopper/tests/data/test.alto1.xml
--- a/qurator/dinglehopper/tests/data/test.alto2.xml
+++ b/qurator/dinglehopper/tests/data/test.alto2.xml
--- a/qurator/dinglehopper/tests/data/test.alto3.xml
+++ b/qurator/dinglehopper/tests/data/test.alto3.xml
--- a/src/dinglehopper/tests/data/test.page2018.xml
+++ b/src/dinglehopper/tests/data/test.page2018.xml
--- a/qurator/dinglehopper/tests/data/test.txt
+++ b/qurator/dinglehopper/tests/data/test.txt
--- a/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/462875_0008.jpg
+++ b/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/462875_0008.jpg
--- a/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/OCR-D-GT_0008.xml
+++ b/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/OCR-D-GT_0008.xml
--- a/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/OCR-D-OCR-TESS_0008.xml
+++ b/qurator/dinglehopper/tests/data/unused-larex-indexed-textequiv-jkamlah/OCR-D-OCR-TESS_0008.xml
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@ -6,7 +6,7 @@ import pytest
 from lxml import etree as ET
 from uniseg.graphemecluster import grapheme_clusters
-from .. import seq_align, ExtractedText
+from .. import ExtractedText, seq_align
 def test_text():
@ -30,12 +30,20 @@ def test_text():
 def test_normalization_check():
    with pytest.raises(ValueError, match=r".*is not in NFC.*"):
-        ExtractedText("foo", None, None,
+        ExtractedText(
            "foo",
            None,
            None,
            unicodedata.normalize("NFD", "Schlyñ"),
-                      grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")))
+            grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")),
-    assert ExtractedText("foo", None, None,
+        )
    assert ExtractedText(
        "foo",
        None,
        None,
        unicodedata.normalize("NFC", "Schlyñ"),
-                         grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")))
+        grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")),
    )
 AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
--- a/qurator/dinglehopper/tests/test_align.py
+++ b/qurator/dinglehopper/tests/test_align.py
@ -1,7 +1,9 @@
 import math
 import pytest
 from .. import align, distance, score_hint, seq_align
 from .util import unzip
 from .. import align, seq_align, distance, score_hint
 def test_left_empty():
@ -72,7 +74,8 @@ def test_with_some_fake_ocr_errors():
    result = list(
        align(
            "Über die vielen Sorgen wegen desselben vergaß",
-            "SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
+            "SomeJunk MoreJunk "
            + "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
        )
    )
    left, right = unzip(result)
@ -183,6 +186,7 @@ def test_lines_similar():
    # Test __eq__ (i.e. is it a substitution or a similar string?)
    assert list(left)[0] == list(right)[0]
 def test_score_hint():
    assert score_hint(0.5, 23) == 12  # int(ceil())
    assert score_hint(math.inf, 12345) is None
--- a/qurator/dinglehopper/tests/test_character_error_rate.py
+++ b/qurator/dinglehopper/tests/test_character_error_rate.py
@ -36,6 +36,7 @@ def test_character_error_rate_hard():
        len(s2) == 7
    )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
-    # Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
+    # Both strings have the same length in terms of grapheme clusters. So the CER should
    # be symmetrical.
    assert character_error_rate(s2, s1) == 1 / 6
    assert character_error_rate(s1, s2) == 1 / 6
--- a/qurator/dinglehopper/tests/test_edit_distance.py
+++ b/qurator/dinglehopper/tests/test_edit_distance.py
--- a/qurator/dinglehopper/tests/test_editops.py
+++ b/qurator/dinglehopper/tests/test_editops.py
--- a/qurator/dinglehopper/tests/test_integ_align.py
+++ b/qurator/dinglehopper/tests/test_integ_align.py
@ -15,7 +15,9 @@ def test_align_page_files():
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
    # → 2 elements in the alignment should be different, the ligature is
    # (currently) not counted due to normalization.
-    # NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
+    #
    # NOTE: In this example, it doesn't matter that we work with "characters", not
    # grapheme clusters.
    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
    ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
--- a/src/dinglehopper/tests/test_integ_bigger_texts.py
+++ b/src/dinglehopper/tests/test_integ_bigger_texts.py
@ -0,0 +1,28 @@
 from __future__ import division, print_function
 import os
 import pytest
 from lxml import etree as ET
 from .. import alto_text, character_error_rate, page_text
 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
 def test_bigger_texts():
    gt = page_text(
        ET.parse(os.path.join(data_dir, "bigger-texts", "00008228", "00008228.gt.xml"))
    )
    ocr = alto_text(
        ET.parse(
            os.path.join(
                data_dir, "bigger-texts", "00008228", "00008228-00236534.gt4hist.xml"
            )
        )
    )
    # Only interested in a result here: In earlier versions this would have used
    # tens of GB of RAM and should now not break a sweat.
    assert character_error_rate(gt, ocr) >= 0.0
--- a/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_character_error_rate_ocr.py
@ -6,7 +6,7 @@ import pytest
 from lxml import etree as ET
 from uniseg.graphemecluster import grapheme_clusters
-from .. import character_error_rate, page_text, alto_text
+from .. import alto_text, character_error_rate, page_text
 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
--- a/src/dinglehopper/tests/test_integ_cli_dir.py
+++ b/src/dinglehopper/tests/test_integ_cli_dir.py
@ -0,0 +1,53 @@
 import os
 import pytest
 from ocrd_utils import initLogging
 from dinglehopper.cli import process_dir
 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
 def test_cli_directory(tmp_path):
    """
    Test that the cli/process_dir() processes a directory of files and
    yields JSON and HTML reports.
    """
    initLogging()
    process_dir(
        os.path.join(data_dir, "directory-test", "gt"),
        os.path.join(data_dir, "directory-test", "ocr"),
        "report",
        str(tmp_path / "reports"),
        False,
        True,
        "line",
    )
    assert os.path.exists(tmp_path / "reports/1.xml-report.json")
    assert os.path.exists(tmp_path / "reports/1.xml-report.html")
    assert os.path.exists(tmp_path / "reports/2.xml-report.json")
    assert os.path.exists(tmp_path / "reports/2.xml-report.html")
@pytest.mark.integration
 def test_cli_fail_without_gt(tmp_path):
    """
    Test that the cli/process_dir skips a file if there is no corresponding file
    in the other directory.
    """
    initLogging()
    process_dir(
        os.path.join(data_dir, "directory-test", "gt"),
        os.path.join(data_dir, "directory-test", "ocr"),
        "report",
        str(tmp_path / "reports"),
        False,
        True,
        "line",
    )
    assert len(os.listdir(tmp_path / "reports")) == 2 * 2
--- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
+++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py
@ -1,9 +1,9 @@
 import json
 import pytest
 from .util import working_directory
 from ..cli import process
 from .util import working_directory
@pytest.mark.integration
--- a/src/dinglehopper/tests/test_integ_differences.py
+++ b/src/dinglehopper/tests/test_integ_differences.py
@ -0,0 +1,37 @@
 import json
 import os
 import pytest
 from ocrd_utils import initLogging
 from dinglehopper.cli import process
 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
 def test_cli_differences(tmp_path):
    """Test that the cli/process() yields a JSON report that includes
    the differences found between the GT and OCR text"""
    initLogging()
    process(
        os.path.join(data_dir, "test-gt.page2018.xml"),
        os.path.join(data_dir, "test-fake-ocr.page2018.xml"),
        "report",
        tmp_path,
        differences=True,
    )
    assert os.path.exists(tmp_path / "report.json")
    with open(tmp_path / "report.json", "r") as jsonf:
        j = json.load(jsonf)
        assert j["differences"] == {
            "character_level": {"n :: m": 1, "ſ :: f": 1},
            "word_level": {
                "Augenblick :: Augemblick": 1,
                "Verſprochene :: Verfprochene": 1,
            },
        }
--- a/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_edit_distance_ocr.py
@ -5,7 +5,7 @@ import os
 import pytest
 from lxml import etree as ET
-from .. import distance, page_text, alto_text
+from .. import alto_text, distance, page_text
 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
--- a/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
+++ b/qurator/dinglehopper/tests/test_integ_ocrd_cli.py
@ -1,21 +1,20 @@
 import json
 import os
 import shutil
 import json
 import sys
 from pathlib import Path
 import pytest
 from click.testing import CliRunner
 from .util import working_directory
 from ..ocrd_cli import ocrd_dinglehopper
 from .util import working_directory
 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
-@pytest.mark.skipif(sys.platform == 'win32', reason="only on unix")
+@pytest.mark.skipif(sys.platform == "win32", reason="only on unix")
 def test_ocrd_cli(tmp_path):
    """Test OCR-D interface"""
--- a/src/dinglehopper/tests/test_integ_summarize.py
+++ b/src/dinglehopper/tests/test_integ_summarize.py
@ -0,0 +1,110 @@
 import json
 import os
 import pytest
 from .. import cli_summarize
 from .util import working_directory
 expected_cer_avg = (0.05 + 0.10) / 2
 expected_wer_avg = (0.15 + 0.20) / 2
 expected_diff_c = {"a": 30, "b": 50}
 expected_diff_w = {"c": 70, "d": 90}
@pytest.fixture
 def create_summaries(tmp_path):
    """Create two summary reports with mock data"""
    reports_dirname = tmp_path / "reports"
    reports_dirname.mkdir()
    report1 = {
        "cer": 0.05,
        "wer": 0.15,
        "differences": {
            "character_level": {"a": 10, "b": 20},
            "word_level": {"c": 30, "d": 40},
        },
    }
    report2 = {
        "cer": 0.10,
        "wer": 0.20,
        "differences": {
            "character_level": {"a": 20, "b": 30},
            "word_level": {"c": 40, "d": 50},
        },
    }
    with open(os.path.join(reports_dirname, "report1.json"), "w") as f:
        json.dump(report1, f)
    with open(os.path.join(reports_dirname, "report2.json"), "w") as f:
        json.dump(report2, f)
    return str(reports_dirname)
@pytest.mark.integration
 def test_cli_summarize_json(tmp_path, create_summaries):
    """Test that the cli/process() yields a summarized JSON report"""
    with working_directory(tmp_path):
        reports_dirname = create_summaries
        cli_summarize.process(reports_dirname)
        with open(os.path.join(reports_dirname, "summary.json"), "r") as f:
            summary_data = json.load(f)
        assert summary_data["num_reports"] == 2
        assert summary_data["cer_avg"] == expected_cer_avg
        assert summary_data["wer_avg"] == expected_wer_avg
        assert summary_data["differences"]["character_level"] == expected_diff_c
        assert summary_data["differences"]["word_level"] == expected_diff_w
@pytest.mark.integration
 def test_cli_summarize_html(tmp_path, create_summaries):
    """Test that the cli/process() yields an HTML report"""
    with working_directory(tmp_path):
        reports_dirname = create_summaries
        cli_summarize.process(reports_dirname)
        html_file = os.path.join(reports_dirname, "summary.html")
        assert os.path.isfile(html_file)
        with open(html_file, "r") as f:
            contents = f.read()
            assert len(contents) > 0
            assert "Number of reports: 2" in contents
            assert f"Average CER: {round(expected_cer_avg, 4)}" in contents
            assert f"Average WER: {round(expected_wer_avg, 4)}" in contents
@pytest.mark.integration
 def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries):
    """
    Test that the cli/process() does not include reports that are missing a WER value.
    """
    with working_directory(tmp_path):
        reports_dirname = create_summaries
        # This third report has no WER value and should not be included in the summary
        report3 = {
            "cer": 0.10,
            "differences": {
                "character_level": {"a": 20, "b": 30},
                "word_level": {"c": 40, "d": 50},
            },
        }
        with open(os.path.join(reports_dirname, "report3-missing-wer.json"), "w") as f:
            json.dump(report3, f)
        cli_summarize.process(reports_dirname)
        html_file = os.path.join(reports_dirname, "summary.html")
        assert os.path.isfile(html_file)
        with open(html_file, "r") as f:
            contents = f.read()
            assert "Number of reports: 2" in contents  # report3 is not included
--- a/qurator/dinglehopper/tests/test_integ_table_extraction.py
+++ b/qurator/dinglehopper/tests/test_integ_table_extraction.py
--- a/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
+++ b/qurator/dinglehopper/tests/test_integ_word_error_rate_ocr.py
@ -5,15 +5,15 @@ import os
 import pytest
 from lxml import etree as ET
-from .. import word_error_rate, words, page_text, alto_text
+from .. import alto_text, page_text, word_error_rate, words
 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
 def test_word_error_rate_between_page_files():
-    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
+    # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
-    # the ligature does not count → 2 errors
+    # So we have 3 changed words, the ligature does not count → 2 errors
    gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
    gt_word_count = (
--- a/qurator/dinglehopper/tests/test_ocr_files.py
+++ b/qurator/dinglehopper/tests/test_ocr_files.py
@ -1,13 +1,11 @@
 import os
 import re
 import lxml.etree as ET
 import textwrap
-import pytest
+import lxml.etree as ET
 from .util import working_directory
 from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
 from .util import working_directory
 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@ -161,7 +159,8 @@ def test_page_level():
    result = page_text(tree, textequiv_level="line")
    assert (
        result
-        == "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
+        == "Hand, Mylord? fragte der Graf von Rocheſter.\n"
        + "Als er einsmals in dem Oberhauſe eine Bill we-"
    )
--- a/qurator/dinglehopper/tests/test_word_error_rate.py
+++ b/qurator/dinglehopper/tests/test_word_error_rate.py
@ -27,7 +27,8 @@ def test_words():
 def test_words_private_use_area():
    result = list(
        words(
-            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
+            "ber die vielen Sorgen wegen deelben vergaß Hartkopf, "
            "der Frau Amtmnnin das ver⸗\n"
            "ſproene zu berliefern."
        )
    )
--- a/qurator/dinglehopper/tests/util.py
+++ b/qurator/dinglehopper/tests/util.py
@ -1,8 +1,8 @@
 import os
 from itertools import zip_longest
 from typing import Iterable
 import colorama
 import os
 def diffprint(x, y):
--- a/Show more
+++ b/Show more
		`@ -1 +1 @@`
			`qurator/dinglehopper/ocrd-tool.json`				`src/dinglehopper/ocrd-tool.json`
		`@ -1 +0,0 @@`
			`__import__("pkg_resources").declare_namespace(__name__)`