Compare commits

...

203 Commits

Author SHA1 Message Date
Mike Gerber b5e99d96c9
Merge pull request from qurator-spk/fix/make-test-results-clearer
✔  GitHub Actions: Make reporting results clearer
Mike Gerber 774790c36f ✔ GitHub Actions: Make reporting results clearer
In the "Actions" tab on GitHub, the workflow run that would post test results to the
_original_ workflow run is named "Test Report". This would lead me to click on it to see
the results, just to be disappointed.

This aims to make the naming of the GitHub workflows/jobs clearer.
Mike Gerber addb572922
Merge pull request from qurator-spk/chore/update-pre-commit
⚙  pre-commit: update
Mike Gerber 1ebb004386 ⚙ pre-commit: update
Mike Gerber c3aa48ec3b Merge branch 'master' of https://github.com/qurator-spk/dinglehopper
Mike Gerber 628594ef98 📦 v0.11.0
Mike Gerber d7814db705
Merge pull request from qurator-spk/feat/flex-line-dirs
Feat/flex line dirs
Mike Gerber 5639f3db7f ✔ Add a test that checks if plain text files with BOM are read correctly
Mike Gerber 9fc8937324 ✒ README: Mention dinglehopper-line-dirs --help
Mike Gerber 14a4bc56d8 🐛 Add --plain-encoding option to dinglehopper-extract
Mike Gerber a70260c10e 🐛 Use warning() to fix DeprecationWarning
Gerber, Mike 224aa02163 🚧 Fix help text
Gerber, Mike 9db5b4caf5 🚧 Add OCR-D parameter for plain text encoding
Gerber, Mike 5578ce83a3 🚧 Add option for text encoding to line dir cli
Gerber, Mike cf59b951a3 🚧 Add option for text encoding to line dir cli
Gerber, Mike 480b3cf864 ✔ Test that CLI produces a complete HTML report
Gerber, Mike f1a586cff1 ✔ Test line dirs CLI
Gerber, Mike 3b16c14c16 ✔ Properly test line dir finding
Gerber, Mike 322faeb26c 🎨 Sort imports
Gerber, Mike c37316da09 🐛 cli_line_dirs: Fix word differences section
At the time of generation of the section, the {gt,ocr}_words generators
were drained. Fix by using a list.

Fixes gh-124.
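A minimal sketch of the pitfall behind this fix, with illustrative names rather than the project's actual code: a generator can only be consumed once, so whatever drained {gt,ocr}_words first left nothing for the word differences section; materializing the words in a list makes them reusable.

```python
# Illustrative only: a generator yields its items exactly once.
def words(text):
    yield from text.split()

gt_words = words("the quick brown fox")
assert list(gt_words) == ["the", "quick", "brown", "fox"]  # first consumer (e.g. the WER)
assert list(gt_words) == []                                # drained: a later report sees nothing

# A list can be iterated as often as needed.
gt_words = list(words("the quick brown fox"))
assert list(gt_words) == ["the", "quick", "brown", "fox"]
assert list(gt_words) == ["the", "quick", "brown", "fox"]
```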
Gerber, Mike 9414a92f9f 🐛 cli_line_dirs: Type-annotate functions
Gerber, Mike 68344e48f8 🎨 Reformat cli_line_dirs
Gerber, Mike 73ee16fe51 🚧 Support 'merged' GT+OCR line directories
Gerber, Mike 6980d7a252 🚧 Use our own removesuffix() as we still support Python 3.8
Gerber, Mike 2bf2529c38 🚧 Port new line dir functions
Gerber, Mike ad8e6de36b 🐛 cli_line_dirs: Fix character diff reports
Gerber, Mike 4024e350f7 🚧 Test new flexible line dirs functions
Mike Gerber 3c317cbeaf
Merge pull request from qurator-spk/chore/update-pre-commit
⚙  pre-commit: update
Mike Gerber d8403421fc ⚙ pre-commit: update
Mike Gerber 3305043234
Merge pull request from qurator-spk/fix/vendor-strings
🐛 Fix vendor strings
Mike Gerber 6bf5bd7178 🐛 Fix vendor strings
Mike Gerber 817e0c95f7 📦 v0.10.1
Mike Gerber 3d7c7ee1e3
Merge pull request from bertsky/allow-uniseg-py38
re-allow uniseg 0.8 and py38
Robert Sachunsky a24623b966 re-allow py38
Robert Sachunsky ea33602336 CI: reactivate py38
Robert Sachunsky 64444dd419 opt out of 7f8a8dd5 (uniseg update that requires py39)
Mike Gerber f6dfb77f94 🐛 pyproject.toml: Fix description
Mike Gerber ef817cb343 📦 v0.10.0
Mike Gerber b1c109baae
Merge pull request from kba/v3-api
V3 api
Mike Gerber 13ab1ae150 🐛 Docker: Use same vendor as license for now
Mike Gerber d974369e13 🐛 Docker: Fix description
Mike Gerber b7bdca4ac8 🐛 Makefile: Make phony targets .PHONY
kba 831a24fc4c typo: report_prefix -> file_id
Konstantin Baierer f6a2c94520 ocrd_cli: but do check for existing output files
Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
Konstantin Baierer 4162836612 ocrd_cli: no need to check fileGrp dir exists
Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
Konstantin Baierer c0aa82d188 OCR-D processor: properly handle missing or non-downloaded GT/OCR file
Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
kba 8c1b6d65f5 Dockerfile: build ocrd-all-tool.json
Mike Gerber f287386c0e 🧹Don't pin uniseg and rapidfuzz
Breakage with the newest uniseg API was fixed in master.

Can't see any issue with rapidfuzz, so removing that pin, too.
kba 63031b30bf Port to OCR-D/core API v3
Mike Gerber bf6633be02
Merge pull request from qurator-spk/chore/update-liccheck
⚙  liccheck: update permissible licenses (mit-cmu, psf 2.0, iscl)
Mike Gerber d3aa9eb520 ⚙ liccheck: update permissible licenses (mit-cmu, psf 2.0, iscl)
Mike Gerber 625686f204
Merge pull request from qurator-spk/chore/update-python-version
⚙  pyproject.toml: Update supported Python version
Mike Gerber ce7886af23 ⚙ pyproject.toml: Update supported Python version
Mike Gerber a09a624bde
Merge pull request from qurator-spk/fix/uniseg-removed-index-parameter
🐛 Fix for changed API of uniseg's word_break
Mike Gerber badfa9c99e ⚙ GitHub Actions: Don't test on Python 3.8 anymore
Mike Gerber 7f8a8dd564 🐛 Fix for changed API of uniseg's word_break
Mike Gerber b72d4f5af9
Merge pull request from qurator-spk/chore/update-pre-commit
⚙  pre-commit: update
Mike Gerber 058042accb ⚙ pre-commit: update
Mike Gerber 071e6a8bd1
Merge pull request from joschrew/dockerfile
Add Dockerfile and Makefile to create ocr-d dockerimage
Mike Gerber 6b82293670
Update Dockerfile
I fancy-clicked @bertsky's change suggestion, which duplicated some labels. Now fancy-clicking the fix, fingers crossed...
Mike Gerber 6ecf49a355
Update Dockerfile
Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
joschrew 9c7c104dce Add Dockerfile and Makefile to create ocr-d image
Mike Gerber 2e6fe0c279
Merge pull request from qurator-spk/python-3.13
✔ Test on Python 3.13
Mike Gerber 1753ed4d13 ✔ Test on Python 3.13
Mike Gerber 3233dbcc8f ✔ pre-commit: Add license check
Mike Gerber f2e290dffe 🐛 Fix --version option in OCR-D CLI
Mike Gerber 6d1daf1dfe Support --version option in CLI
Mike Gerber 27ad145c7e ⚙ pyproject.toml: Add license.file
Mike Gerber 2e9e88cc1e ⚙ pre-commit: Update hooks
Mike Gerber 129e6eb427 📦 v0.9.7
Mike Gerber cf998443c1 ⚙ ruff: Update settings (select → lint.select)
Mike Gerber 6048107889 Merge branch 'master' of https://github.com/qurator-spk/dinglehopper
Mike Gerber 2ee37ed4e3 🎨 Sort imports
Mike Gerber 521f034fba
Merge pull request from stweil/master
Fix typo
Mike Gerber d1a2247615 ⚙ pre-commit: Update hooks
Mike Gerber 4047f8b6e5 🐛 Fix loading ocrd-tool.json for Python 3.12
Stefan Weil cd68a973cb Fix typo
Signed-off-by: Stefan Weil <sw@weilnetz.de>
Mike Gerber bc5818da9f ✔ GitHub Actions: Update used actions
Mike Gerber c91234daba ✔ GitHub Actions: Update used actions
Mike Gerber a534b5e28e ⚙ pre-commit: Update hooks
Mike Gerber b336f98271 🐛 Fix reading plain text files
As reported by @tallemeersch in gh-107, newlines were not removed for plain text files.
Fix this by stripping the lines as suggested.

Fixes gh-107.
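A minimal sketch of the behaviour described above, with hypothetical names (not the project's plain_extract()): strip the trailing newline from each line when reading a plain text file.

```python
from typing import List

# Hypothetical helper, for illustration only.
def read_plain_lines(path: str, encoding: str = "utf-8") -> List[str]:
    with open(path, encoding=encoding) as f:
        return [line.rstrip("\n") for line in f]
```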
Mike Gerber 41a0fad352 📦 v0.9.6
Mike Gerber e72d1e37ea Revert "✔ Test on Python 3.13"
This reverts commit 0d5c6d5a62.
Mike Gerber 86e723cd53 🐛 GHA: Install possible shapely build requirements (if building from source)
Mike Gerber dc4565fd2d
Merge pull request from stweil/typos
Fix some typos (found by `codespell` and `typos`)
Mike Gerber fbcb9160fd 🐛 GHA: Install possible lxml build requirements (if building from source)
Mike Gerber 0d5c6d5a62 ✔ Test on Python 3.13
Mike Gerber e34adbf41c 🐛 Fix Python 3.12 support by requiring ocrd >= 2.65.0
Mike Gerber 58a688b175 ⚙ pre-commit: Update hooks
Stefan Weil 79701e410d Fix some typos (found by `codespell` and `typos`)
Signed-off-by: Stefan Weil <sw@weilnetz.de>
Mike Gerber 2383730a55 ✔ Test using empty files
Test edge cases + empty files, e.g. empty text content and a Unicode BOM character.

See also gh-79.
Mike Gerber 98d7928f45 ⚙ pre-commit: Update hooks
Mike Gerber edabffec7e 🧹 tests: Move comment out of the code (bad style + weird formatting)
Mike Gerber 32d4037533 ⚙ cli: Annotate types in process_dir()
Mike Gerber fe1a713d55 ⚙ pre-commit: Update hooks
Mike Gerber be7c1dd25d 🧹 Make from_text_segment()'s textequiv_level keyword-only
Mike Gerber 932bfafc7d 🧹 Make process_dir() keyword arguments keyword-only
Mike Gerber 945aec5673 ✒ README-DEV: Releasing a new version
Mike Gerber c29a80bc81 📦 v0.9.5
Mike Gerber a1c1d0ad49 ⚙ pre-commit: Add mypy dependencies
Closes gh-106.
Mike Gerber 5d9f0c482f 🐛 Check that we always get a valid ALTO namespace (satisfies mypy)
Mike Gerber 19d1a00817 🎨 Reformat (Black)
Mike Gerber 4dc6b7dc04 ⚙ pre-commit: Update hooks
Mike Gerber 6b3697c864 Merge branch 'master' of https://github.com/qurator-spk/dinglehopper
Mike Gerber 4d4ead4cc8 🐛 Fix word segmentation with uniseg 0.8.0
Mike Gerber 0e3d24cac1
🐛 README.md: Fix badge (for real)
Mike Gerber 4016c01638
🐛 README.md: Fix test badge
Mike Gerber 4b64398cec 🚧 GitLab CI Test: Depend on child pipeline
Mike Gerber 7e033b6f03 🚧 GitLab CI Test: Depend on child pipeline
Mike Gerber 250ee2b7f2 🚧 GitLab CI Test: Push after pulling
Mike Gerber 76c4533aa5 🚧 GitLab CI Test: Push after pulling
Mike Gerber f8e31089b3 🚧 GitLab CI Test: Push after pulling
Mike Gerber 6cfb49fe39 🚧 GitLab CI Test: Push after pulling
Mike Gerber 5eba65f097 🚧 GitLab CI Test: Trigger only on default branch (and do not hardcode it)
Mike Gerber 83cef3106f 🚧 GitLab CI Test
Mike Gerber a95a85a889 🚧 GitLab CI Test
Mike Gerber ff34c65c1e 🔍 ruff: Remove ignore configuration, we use multimethods in a compatible way now
Mike Gerber 21c44d426e ⚙ pre-commit: Update hooks
Mike Gerber 10ccba989e 🚧 GitLab CI Test
Mike Gerber 10d423f045 🚧 GitLab CI Test
Mike Gerber 6d947a9ca9 🚧 GitLab CI Test
Mike Gerber 484da90d27 🚧 GitLab CI Test
Mike Gerber d0ddfa68a1 🚧 GitLab CI Test
Mike Gerber 81391132f0 🚧 GitLab CI Test
Mike Gerber dc390cd3f8 🚧 GitLab CI Test
Mike Gerber c77e8f51ab 🚧 GitLab CI Test
Mike Gerber e083688c66 🚧 GitLab CI Test
Mike Gerber 6d8afc27b3 🚧 GitLab CI Test
Mike Gerber af83b35f23 🚧 GitLab CI Test
Mike Gerber 344f96dca9 🚧 GitLab CI Test
Mike Gerber 483e809691 🔍 mypy: Use an almost strict mypy configuration, and fix any issues
Mike Gerber ad316aeabc 🔍 mypy: Use a compatible syntax for multimethod
Mike Gerber 8166435958 🔍 mypy: Remove ExtractedText.segments converter
Mike Gerber 24c25b6fcd 🔍 mypy: Avoid using check() for all attr validators
Mike Gerber ac9d360dcd 🔍 mypy: Make cli.process() typed so mypy checks it (and issues no warning)
Mike Gerber 788868b2ac Merge branch 'pr103'
Mike Gerber 59a3882ce5 🧹 GitHub Actions: Clean up whitespace
Sadra Barikbin 4466422cda Fix a typo
Sadra Barikbin 967f833eac Improve report
Sadra Barikbin f4ff6a8f31 Change reporter
Sadra Barikbin 4413ddac8f Temporary commit
Sadra Barikbin 6884c5c825 Update dorny dependency
Sadra Barikbin c90a61c12c Fix a few typos
Sadra Barikbin bf47308c00 Add report_tests workflow
Mike Gerber 4bf123de43 ⚙ Update ruff+mypy dependencies
Mike Gerber b36727ed9e ⚙ pre-commit: Update hooks
Mike Gerber 7a192880f1 ⬆ Move on to supporting Python >= 3.8 only
Mike Gerber c752793be6 🐛 Use typing.List instead of list, for Python <3.9
Mike Gerber 071766efc2 🐛 Use Optional instead of | None, for Python <3.10
Mike Gerber 4832d1542f ⚙ pre-commit: Update hooks
Mike Gerber c1681551af 🐛 Fix generating word differences
Mike Gerber 44bd4b5eda ⚙ pre-commit: Update hooks
Mike Gerber 296a820990 Merge branch 'master' of https://github.com/qurator-spk/dinglehopper
Mike Gerber 38fcbc8e1c Merge branch 'master' into performance
Mike Gerber d3fb3f96cf
Merge pull request from sadra-barikbin/patch-1
Fix a tiny typo in Levenshtein notebook
Sadra Barikbin b0e906ad00
Update Levenshtein.ipynb
Fix a tiny typo in Levenshtein notebook.
Mike Gerber 68a12f8f7f ⬆ Update uniseg dependency
@maxbachmann also improved the performance of uniseg, and it is in 0.7.2 - update our
dependency.
Mike Gerber de6cd8f1e7 Make joining grapheme clusters more robust by checking joiner and handling an empty joiner
Mike Gerber 7c6ee593f0 🐛 Fix score_hint call in cli_line_dirs
Mike Gerber 618ea567de 🐛 Fix docstring of distance() for grapheme clusters
Mike Gerber e256526ea1 🐛 Fix calculation of score_hint for edge cases, e.g. when CER is infinite
If the CER is infinite, we can't calculate a score_hint as an int. Fall back to None
in this case.
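The full score_hint() helper added to align.py appears in the diff further down; condensed into a standalone sketch, the edge case looks like this: ceil(er * n) is infinite (or NaN for n == 0) when the error rate is infinite, so converting it to an int raises and the helper falls back to None.

```python
import math
from math import ceil
from typing import Optional

def score_hint(er: float, n: int) -> Optional[int]:
    """Expected edit distance for RapidFuzz, or None if the error rate is infinite."""
    assert not math.isnan(er)
    try:
        return int(ceil(er * n))
    except (OverflowError, ValueError):
        # ceil(inf) raises OverflowError; ceil(nan) raises ValueError (er == inf, n == 0).
        return None

assert score_hint(0.25, 100) == 25
assert score_hint(math.inf, 100) is None
```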
Mike Gerber bc95c03127 🕸Do not use deprecated ID, pageId options
See gh-75.
Mike Gerber 7fef02bf0a ✔ Add mets:FLocat's @LOCTYPE/OTHERLOCTYPE to test data
Newest OCR-D wasn't happy with the test data anymore (see gh-89). I'm not sure if the
test data was invalid the way it was, but having a LOCTYPE certainly is "prettier" so
adding it. This fixes the test again.
Mike Gerber 7ed076d3c1 ⬆ Update multimethod dependency
We had some issues while reviewing/rebasing . We don't support Python 3.5 anymore,
so lifting the hard pin on multimethod 1.3.
Mike Gerber f077ce2e1b 🐛 dinglehopper-summarize: Handle reports without difference stats
Mike Gerber 39dc4186d6
Merge pull request from qurator-spk/clean-remove-six-dep-again
🧹 Remove old six dependency (workaround for )
Mike Gerber d776368484
Merge pull request from qurator-spk/test-on-pr-but-really
🐛 (Hopefully) Fix running tests on PR
Mike Gerber 3f8c8e69aa 🐛 (Hopefully) Fix running tests on PR
Mike Gerber d8f84ec9ac 🧹 Remove old six dependency (workaround for )
Mike Gerber df1d4d09f3
Merge pull request from qurator-spk/test-on-pr
✔ GitHub Actions: Test on PR
Mike Gerber e7e0703d9d ✔ GitHub Actions: Test on PR
Mike Gerber 22e7247ac4
Merge pull request from qurator-spk/update-dep-multimethod
⬆ Update multimethod dependency
Mike Gerber 1c3b28d873 ⬆ Update multimethod dependency
We had some issues while reviewing/rebasing . We don't support Python 3.5 anymore,
so lifting the hard pin on multimethod 1.3.
Mike Gerber 05b5502c57
Merge pull request from qurator-spk/update-pre-commit
Update pre commit
Mike Gerber fe60361e8d ✒ README-DEV: Make pre-commit section top-level (+ small whitespace fix)
Mike Gerber 8a1ea4ec93 🎨 Add newlines at end of files (ruff)
Mike Gerber 4e0d4dcf09 ⚙ pre-commit: Add pre-commit-update hook (to update hooks using pre-commit)
Mike Gerber 061ba16461 ⚙ pre-commit: Update hooks
Mike Gerber 0c727dca9d
Merge pull request from qurator-spk/test-remove-circleci
✔ Remove CircleCI config
Mike Gerber 1b7c2a61a3 ✔ Remove CircleCI config
Mike Gerber 994a27d458
Merge pull request from qurator-spk/test-on-python-3.12
✔ GitHub Actions: Test on Python 3.12
Mike Gerber 5450f193e4 ✔ GitHub Actions: Test on Python 3.12
Mike Gerber 9d862e418b ✔ Add mets:FLocat's @LOCTYPE/OTHERLOCTYPE to test data
Newest OCR-D wasn't happy with the test data anymore (see gh-89). I'm not sure if the
test data was invalid the way it was, but having a LOCTYPE certainly is "prettier" so
adding it. This fixes the test again.
Mike Gerber dbaccdd5e3 ✒ README: Minor whitespace cleanup
Mike Gerber 54a3121172 ✒ README: Recommend installing via pip and from PyPI
Mike Gerber a1a7f95ac6 📦 v0.9.4
Mike Gerber 1e7c46285b 🎨 editorconfig: *.json should have a final newline
Mike Gerber 9594b4c9d2 🧹 pyproject: Remove extra *.json
Mike Gerber de70b198ac 🧹 Remove empty setup.cfg
Mike Gerber 6c70afbbc5 📦 v0.9.3
Mike Gerber 12b1ea3ae7 🐛 Remove MANIFEST.in workaround, now that setuptools_ocrd is fixed
Mike Gerber 98a67c7b3b 📦 v0.9.2
Mike Gerber 668072e338 🧹 .gitignore dist/
Mike Gerber 563642c93b 🐛 Workaround sdist not containing top-level ocrd-tool.json
See https://github.com/qurator-spk/setuptools_ocrd/issues/10 - The sdist does not
contain ocrd-tool.json, so that the wheel built from it does not get the proper version.
Needs to be fixed in setuptools_ocrd, then MANIFEST.in can be removed again.
Gerber, Mike a18b25b163 🐛 Update tests for ExtractedText
In PR gh-72, @maxbachmann introduced a new argument for ExtractedText(). Update the
corresponding tests.
Max Bachmann f48e305347
use uniseg again
Max Bachmann d2bbc8a6c7 update rapidfuzz version
Max Bachmann a1f0a5e2d3 replace uniseg with uniseg2
Max Bachmann 22c3817f45 apply black
Max Bachmann 01571f23b7 move grapheme clusters to ExtractedText
Max Bachmann f211d09f56 remove python2.7 futures
Max Bachmann 205a969c0e remove unused includes
Max Bachmann f3825cdeb6
only call `words_normalized` once

@ -1,20 +0,0 @@
version: 2.1
jobs:
black:
parameters:
python-version:
type: string
docker:
- image: cimg/python:<< parameters.python-version >>
steps:
- checkout
- run: pip3 install --upgrade pip
- run: pip3 install black
- run: black .
workflows:
black:
jobs:
- black:
python-version: "3.11"

@ -0,0 +1,5 @@
src/dinglehopper/tests
dist
build
*.egg-info
.git

@ -15,7 +15,7 @@ indent_size = 2
[*.json]
indent_size = 2
insert_final_newline = false
insert_final_newline = true
# trailing spaces in markdown indicate word wrap
[*.md]

@ -17,7 +17,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Upgrade pip
run: python3 -m pip install --upgrade pip
- name: Install setuptools
@ -32,7 +32,7 @@ jobs:
- name: Build package
run: python3 -m pip install --upgrade build && python3 -m build
- name: Upload dist
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: dist
path: dist/
@ -42,7 +42,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Download dist
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: dist
path: dist/
@ -61,7 +61,7 @@ jobs:
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
steps:
- name: Download dist
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: dist
path: dist/

@ -1,4 +1,4 @@
name: test
name: 'Test'
on:
@ -6,6 +6,10 @@ on:
branches:
- master
pull_request:
branches:
- master
schedule:
- cron: "00 16 07 * *" # = monthly
@ -21,30 +25,27 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ]
# For Python 3.6, we need to fall back to Ubuntu 20.04
runs-on: ${{ matrix.python-version == '3.6' && 'ubuntu-20.04' || 'ubuntu-latest' }}
env:
test_results_dir: test-results-${{ matrix.python-version }}
runs-on: "ubuntu-latest"
steps:
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
allow-prereleases: true
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Install possible lxml build requirements (if building from source)
run: sudo apt-get install -y libxml2-dev libxslt-dev python3-dev
- name: Install possible shapely build requirements (if building from source)
run: sudo apt-get install -y libgeos-dev
- name: Update pip
run: python3 -m pip install -U pip
- name: Avoid compiling OpenCV and NumPy on Python 3.6
run: |
if python3 --version | grep -q "Python 3.6"; then
pip install --prefer-binary -U opencv-python-headless numpy
fi
- name: Install requirements*.txt
run: |
for requirements_txt in requirements*.txt; do
@ -54,19 +55,10 @@ jobs:
- name: Test
run: |
cd src
mkdir -p ../$test_results_dir
python3 -m pytest --junitxml=../$test_results_dir/junit.xml -o junit_family=legacy
python3 -m pytest --junitxml=../${{matrix.python-version}}-junit.xml -o junit_family=legacy
- name: Upload test results
uses: actions/upload-artifact@v3
if: success() || failure()
with:
name: ${{ env.test_results_dir }}
path: ${{ env.test_results_dir }}
- name: Report tests
uses: dorny/test-reporter@v1
uses: actions/upload-artifact@v4
if: success() || failure()
with:
name: Results on Python ${{ matrix.python-version }}
path: "${{env.test_results_dir }}/junit.xml"
reporter: java-junit
name: test-results-${{matrix.python-version}}
path: ${{matrix.python-version}}-junit.xml

@ -0,0 +1,20 @@
name: 'Test - Report results'
on:
workflow_run:
workflows: ['test']
types:
- completed
permissions:
contents: read
actions: read
checks: write
jobs:
report:
runs-on: ubuntu-latest
steps:
- uses: dorny/test-reporter@v1
with:
artifact: /test-results-(.*)/
name: 'test - Results ($1)'
path: '*junit.xml'
reporter: java-junit

.gitignore (2 changes)

@ -25,6 +25,8 @@ dmypy.json
# User-specific stuff
.idea
.*.swp
# Build artifacts
/build
/dist

@ -0,0 +1,16 @@
variables:
http_proxy: "http://http-proxy.sbb.spk-berlin.de:3128/"
https_proxy: "http://http-proxy.sbb.spk-berlin.de:3128/"
HTTP_PROXY: "http://http-proxy.sbb.spk-berlin.de:3128/"
HTTPS_PROXY: "http://http-proxy.sbb.spk-berlin.de:3128/"
stages:
- triggers
mirror:
stage: triggers
trigger:
include: .gitlab/mirror.yml
strategy: depend
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH

@ -0,0 +1,47 @@
stages:
- check
- pull
- push
default:
image: debian
check:
stage: check
script:
- whoami; env
- if [ -z "$CI_COMMIT_BRANCH" ]; then echo "Not on a branch" >&2; exit 3; fi
pull-gitlab:
stage: pull
script:
- echo "This is redundant"
pull-github:
stage: pull
before_script:
- apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
script:
- git remote remove github 2>/dev/null || true
- git remote add github https://github.com/qurator-spk/dinglehopper.git
- git remote -v
- git pull github "$CI_COMMIT_BRANCH"
push-gitlab:
stage: push
before_script:
- apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
script:
- git push origin "$CI_COMMIT_SHA":"$CI_COMMIT_BRANCH"
push-github:
stage: push
before_script:
- apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
script:
- git push github "$CI_COMMIT_SHA":"$CI_COMMIT_BRANCH"

@ -1,8 +1,6 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.2.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
@ -13,17 +11,37 @@ repos:
- id: check-ast
- repo: https://github.com/psf/black
rev: 22.10.0
rev: 25.1.0
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.280
rev: v0.11.7
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
- args:
- --fix
- --exit-non-zero-on-fix
id: ruff
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.4.1
rev: v1.15.0
hooks:
- id: mypy
- additional_dependencies:
- types-setuptools
- types-lxml
- numpy # for numpy plugin
- attrs
- multimethod
- rapidfuzz
id: mypy
- repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
rev: v0.6.1
hooks:
- id: pre-commit-update
- repo: https://github.com/dhatim/python-license-check
rev: 0.9.2
hooks:
- id: liccheck
language: system

@ -0,0 +1,38 @@
ARG DOCKER_BASE_IMAGE
FROM $DOCKER_BASE_IMAGE
ARG VCS_REF
ARG BUILD_DATE
LABEL \
maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
org.label-schema.vcs-ref=$VCS_REF \
org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
org.label-schema.build-date=$BUILD_DATE \
org.opencontainers.image.vendor="Staatsbibliothek zu Berlin - SPK" \
org.opencontainers.image.title="dinglehopper" \
org.opencontainers.image.description="An OCR evaluation tool" \
org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
org.opencontainers.image.revision=$VCS_REF \
org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.base.name=ocrd/core
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8
# avoid HOME/.local/share (hard to predict USER here)
# so let XDG_DATA_HOME coincide with fixed system location
# (can still be overridden by derived stages)
ENV XDG_DATA_HOME /usr/local/share
# avoid the need for an extra volume for persistent resource user db
# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
WORKDIR /build/dinglehopper
COPY . .
COPY ocrd-tool.json .
# prepackage ocrd-tool.json as ocrd-all-tool.json
RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
RUN make install && rm -rf /build/dinglehopper
WORKDIR /data
VOLUME /data

@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 qurator
Copyright 2019-2025 Staatsbibliothek zu Berlin - SPK
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

@ -0,0 +1,33 @@
PYTHON = python3
PIP = pip3
PYTHONIOENCODING=utf8
PYTEST_ARGS = -vv
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
DOCKER_TAG = ocrd/dinglehopper
help:
@echo
@echo " Targets"
@echo
@echo " install Install full Python package via pip"
@echo " docker Build the ocrd/dinglehopper docker image"
# Install Python package via pip
install:
$(PIP) install .
install-dev:
$(PIP) install -e .
test:
pytest $(PYTEST_ARGS)
docker:
docker build \
--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
-t $(DOCKER_TAG) .
.PHONY: help install install-dev test docker

@ -10,6 +10,7 @@ pytest
```
## Test running examples
Only unit tests:
```bash
pytest -m "not integration"
@ -36,9 +37,21 @@ pytest -k "not test" --mypy
pytest -k "not test" --ruff
```
## How to use pre-commit
# How to use pre-commit
This project optionally uses [pre-commit](https://pre-commit.com) to check commits. To use it:
- Install pre-commit, e.g. `pip install -r requirements-dev.txt`
- Install the repo-local git hooks: `pre-commit install`
# Releasing a new version
- Update `ocrd-tool.json`
- `git commit`
- `git tag vx.y.z`
- `git push && git push --tags`
- The GitHub Actions workflow `release` will now create
a. a new release on GitHub and
b. a new release on PyPI
- Currently requires a review for PYPI?

@ -8,7 +8,7 @@ compares a ground truth (GT) document page with an OCR result page to compute
metrics and a word/character differences report. It also supports batch processing by
generating, aggregating and summarizing multiple reports.
[![Tests](https://github.com/qurator-spk/dinglehopper/workflows/test/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
[![Tests](https://github.com/qurator-spk/dinglehopper/actions/workflows/test.yml/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
[![GitHub tag](https://img.shields.io/github/tag/qurator-spk/dinglehopper?include_prereleases=&sort=semver&color=blue)](https://github.com/qurator-spk/dinglehopper/releases/)
[![License](https://img.shields.io/badge/License-Apache-blue)](#license)
[![issues - dinglehopper](https://img.shields.io/github/issues/qurator-spk/dinglehopper)](https://github.com/qurator-spk/dinglehopper/issues)
@ -23,10 +23,11 @@ Goals
Installation
------------
It's best to use pip, e.g.:
~~~
sudo pip install .
~~~
It's best to use pip to install the package from PyPI, e.g.:
```
pip install dinglehopper
```
Usage
-----
@ -99,11 +100,11 @@ This generates `summary.html` and `summary.json` in the same `output_folder`.
If you are summarizing many reports and have used the `--differences` flag while
generating them, it may be useful to limit the number of differences reported by using
the `--occurences-threshold` parameter. This will reduce the size of the generated HTML
the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
report, making it easier to open and navigate. Note that the JSON report will still
contain all differences. Example:
~~~
dinglehopper-summarize output_folder/ --occurences-threshold 10
dinglehopper-summarize output_folder/ --occurrences-threshold 10
~~~
### dinglehopper-line-dirs
@ -111,9 +112,13 @@ You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.
with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate
CLI interface:
~~~
```
dinglehopper-line-dirs gt/ ocr/
~~~
```
The CLI `dinglehopper-line-dirs` can also work with GT text files in the same
directories as the OCR text files. You should read `dinglehopper-line-dirs --help`
in this case.
### dinglehopper-extract
The tool `dinglehopper-extract` extracts the text of the given input file on

@ -7,9 +7,10 @@ authors = [
{name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
{name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
]
description = "The OCR evaluation tool"
description = "An OCR evaluation tool"
readme = "README.md"
requires-python = ">=3.6"
license.file = "LICENSE"
requires-python = ">=3.8"
keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
dynamic = ["version", "dependencies", "optional-dependencies"]
@ -48,7 +49,7 @@ optional-dependencies.dev = {file = ["requirements-dev.txt"]}
where = ["src"]
[tool.setuptools.package-data]
dinglehopper = ["*.json", "templates/*"]
dinglehopper = ["templates/*", "*.json"]
[tool.pytest.ini_options]
@ -60,11 +61,54 @@ markers = [
[tool.mypy]
plugins = ["numpy.typing.mypy_plugin"]
ignore_missing_imports = true
[tool.ruff]
strict = true
disallow_subclassing_any = false
# ❗ error: Class cannot subclass "Processor" (has type "Any")
disallow_any_generics = false
disallow_untyped_defs = false
disallow_untyped_calls = false
[tool.ruff.lint]
select = ["E", "F", "I"]
ignore = [
"F811", # multimethods are considered redefinitions by ruff
[tool.liccheck]
authorized_licenses = [
"bsd",
"new bsd",
"bsd license",
"new bsd license",
"simplified bsd",
"apache",
"apache 2.0",
"apache software license",
"apache software",
"apache license 2.0",
"gnu lgpl",
"lgpl with exceptions or zpl",
"GNU Library or Lesser General Public License (LGPL)",
"GNU Lesser General Public License v3 (LGPLv3)",
"GNU Lesser General Public License v2 or later (LGPLv2+)",
"mit",
"mit license",
"mit-cmu",
"python software foundation",
"psf",
"psf-2.0",
"Historical Permission Notice and Disclaimer (HPND)",
"public domain",
'The Unlicense (Unlicense)',
"isc",
"ISC License (ISCL)",
'Mozilla Public License 2.0 (MPL 2.0)',
]
unauthorized_licenses = [
"gpl v3",
]

@ -1,8 +1,14 @@
pytest
pytest-cov
pytest-mypy
black
pre-commit
ruff ; python_version >= "3.7"
pytest-ruff ; python_version >= "3.7"
ruff
pytest-ruff
mypy
types-lxml
types-setuptools
pytest-mypy
liccheck

@ -1,14 +1,14 @@
click
jinja2
lxml
uniseg
uniseg >= 0.8.0
numpy
colorama
MarkupSafe
ocrd >= 2.20.1
ocrd >= 3.3.0
attrs
multimethod == 1.3 # latest version to officially support Python 3.5
multimethod >= 1.3
tqdm
rapidfuzz >= 2.4.2
six # XXX workaround OCR-D/core#730
rapidfuzz >= 2.7.0
chardet
importlib_resources

@ -1,4 +1,4 @@
from .align import align, seq_align
from .align import align, score_hint, seq_align
from .character_error_rate import character_error_rate, character_error_rate_n
from .edit_distance import distance, editops
from .extracted_text import ExtractedText
@ -16,6 +16,7 @@ __all__ = [
"editops",
"distance",
"align",
"score_hint",
"seq_align",
"character_error_rate",
"character_error_rate_n",

@ -1,8 +1,10 @@
import math
import unicodedata
from math import ceil
from typing import Optional
from rapidfuzz.distance import Levenshtein
from .edit_distance import grapheme_clusters
from uniseg.graphemecluster import grapheme_clusters
def align(t1, t2):
@ -12,11 +14,27 @@ def align(t1, t2):
return seq_align(s1, s2)
def seq_align(s1, s2):
def score_hint(er: float, n: int) -> Optional[int]:
"""Calculate RapidFuzz score hint for a given error rate and count.
Gives the score hint for the distance functions (= expected distance) or None if
the error rate is inf.
"""
assert not math.isnan(er)
try:
score_hint = int(ceil(er * n))
except (OverflowError, ValueError):
# ceil(er * n) can be inf or NaN (for n == 0), so int() can throw an
# OverflowError and a ValueError.
score_hint = None
return score_hint
def seq_align(s1, s2, score_hint=None):
"""Align general sequences."""
s1 = list(s1)
s2 = list(s2)
ops = Levenshtein.editops(s1, s2)
ops = Levenshtein.editops(s1, s2, score_hint=score_hint)
i = 0
j = 0

@ -1,7 +1,5 @@
from __future__ import division
import unicodedata
from typing import Tuple
from typing import List, Tuple, TypeVar
from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters
@ -9,9 +7,13 @@ from uniseg.graphemecluster import grapheme_clusters
from .edit_distance import distance
from .extracted_text import ExtractedText
T = TypeVar("T")
@multimethod
def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
def character_error_rate_n(
reference: List[str], compared: List[str]
) -> Tuple[float, int]:
"""
Compute character error rate.
@ -19,7 +21,7 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
"""
d = distance(reference, compared)
n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))
n = len(reference)
if d == 0:
return 0, n
@ -30,18 +32,28 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
# XXX Should we really count newlines here?
@multimethod
def character_error_rate_n(
reference: ExtractedText, compared: ExtractedText
) -> Tuple[float, int]:
return character_error_rate_n(reference.text, compared.text)
@character_error_rate_n.register
def _(reference: str, compared: str) -> Tuple[float, int]:
seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", reference)))
seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", compared)))
cer, n = character_error_rate_n(seq1, seq2)
return cer, n
@character_error_rate_n.register
def _(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
cer, n = character_error_rate_n(
reference.grapheme_clusters, compared.grapheme_clusters
)
return cer, n
def character_error_rate(reference, compared) -> float:
def character_error_rate(reference: T, compared: T) -> float:
"""
Compute character error rate.
:return: character error rate
"""
cer: float
cer, _ = character_error_rate_n(reference, compared)
return cer

@ -1,13 +1,13 @@
import os
from collections import Counter
from typing import List
import click
from jinja2 import Environment, FileSystemLoader
from markupsafe import escape
from ocrd_utils import initLogging
from uniseg.graphemecluster import grapheme_clusters
from dinglehopper.align import seq_align
from dinglehopper.align import score_hint, seq_align
from dinglehopper.character_error_rate import character_error_rate_n
from dinglehopper.config import Config
from dinglehopper.extracted_text import ExtractedText
@ -15,7 +15,9 @@ from dinglehopper.ocr_files import extract
from dinglehopper.word_error_rate import word_error_rate_n, words_normalized
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
def gen_diff_report(
gt_in, ocr_in, css_prefix, joiner, none, *, differences=False, score_hint=None
):
gtx = ""
ocrx = ""
@ -42,9 +44,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
if isinstance(gt_in, ExtractedText):
if not isinstance(ocr_in, ExtractedText):
raise TypeError()
# XXX splitting should be done in ExtractedText
gt_things = list(grapheme_clusters(gt_in.text))
ocr_things = list(grapheme_clusters(ocr_in.text))
gt_things = gt_in.grapheme_clusters
ocr_things = ocr_in.grapheme_clusters
else:
gt_things = gt_in
ocr_things = ocr_in
@ -53,7 +54,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
o_pos = 0
found_differences = []
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, score_hint)):
css_classes = None
gt_id = None
ocr_id = None
@ -76,7 +77,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
if o is not None:
o_pos += len(o)
found_differences = dict(Counter(elem for elem in found_differences))
counted_differences = dict(Counter(elem for elem in found_differences))
return (
"""
@ -87,7 +88,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
""".format(
gtx, ocrx
),
found_differences,
counted_differences,
)
@ -105,39 +106,56 @@ def json_float(value):
def process(
gt,
ocr,
report_prefix,
reports_folder=".",
gt: str,
ocr: str,
report_prefix: str,
reports_folder: str = ".",
*,
metrics=True,
differences=False,
textequiv_level="region",
):
metrics: bool = True,
differences: bool = False,
textequiv_level: str = "region",
plain_encoding: str = "autodetect",
) -> None:
"""Check OCR result against GT.
The @click decorators change the signature of the decorated functions, so we keep
this undecorated version and use Click on a wrapper.
"""
gt_text = extract(gt, textequiv_level=textequiv_level)
ocr_text = extract(ocr, textequiv_level=textequiv_level)
gt_text = extract(
gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding
)
ocr_text = extract(
ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding
)
gt_words: List[str] = list(words_normalized(gt_text))
ocr_words: List[str] = list(words_normalized(ocr_text))
assert isinstance(gt_text, ExtractedText)
assert isinstance(ocr_text, ExtractedText)
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
wer, n_words = word_error_rate_n(gt_text, ocr_text)
char_diff_report, diff_c = gen_diff_report(
gt_text, ocr_text, css_prefix="c", joiner="", none="·", differences=differences
gt_text,
ocr_text,
css_prefix="c",
joiner="",
none="·",
score_hint=score_hint(cer, n_characters),
differences=differences,
)
gt_words = words_normalized(gt_text)
ocr_words = words_normalized(ocr_text)
# {gt,ocr}_words must not be a generator, so we don't drain it for the differences
# report.
assert isinstance(gt_words, list)
assert isinstance(ocr_words, list)
wer, n_words = word_error_rate_n(gt_words, ocr_words)
word_diff_report, diff_w = gen_diff_report(
gt_words,
ocr_words,
css_prefix="w",
joiner=" ",
none="",
score_hint=score_hint(wer, n_words),
differences=differences,
)
@ -174,8 +192,16 @@ def process(
def process_dir(
gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level
):
gt: str,
ocr: str,
report_prefix: str,
reports_folder: str = ".",
*,
metrics: bool = True,
differences: bool = False,
textequiv_level: str = "region",
plain_encoding: str = "autodetect",
) -> None:
for gt_file in os.listdir(gt):
gt_file_path = os.path.join(gt, gt_file)
ocr_file_path = os.path.join(ocr, gt_file)
@ -189,6 +215,7 @@ def process_dir(
metrics=metrics,
differences=differences,
textequiv_level=textequiv_level,
plain_encoding=plain_encoding,
)
else:
print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
@ -213,7 +240,13 @@ def process_dir(
help="PAGE TextEquiv level to extract text from",
metavar="LEVEL",
)
@click.option(
"--plain-encoding",
default="autodetect",
help='Encoding (e.g. "utf-8") of plain text files',
)
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
@click.version_option()
def main(
gt,
ocr,
@ -222,6 +255,7 @@ def main(
metrics,
differences,
textequiv_level,
plain_encoding,
progress,
):
"""
@ -256,9 +290,10 @@ def main(
ocr,
report_prefix,
reports_folder,
metrics,
differences,
textequiv_level,
metrics=metrics,
differences=differences,
textequiv_level=textequiv_level,
plain_encoding=plain_encoding,
)
else:
process(
@ -269,6 +304,7 @@ def main(
metrics=metrics,
differences=differences,
textequiv_level=textequiv_level,
plain_encoding=plain_encoding,
)

@ -12,7 +12,12 @@ from .ocr_files import extract
help="PAGE TextEquiv level to extract text from",
metavar="LEVEL",
)
def main(input_file, textequiv_level):
@click.option(
"--plain-encoding",
default="autodetect",
help='Encoding (e.g. "utf-8") of plain text files',
)
def main(input_file, textequiv_level, plain_encoding):
"""
Extract the text of the given INPUT_FILE.
@ -23,7 +28,9 @@ def main(input_file, textequiv_level):
use "--textequiv-level line" to extract from the level of TextLine tags.
"""
initLogging()
input_text = extract(input_file, textequiv_level=textequiv_level).text
input_text = extract(
input_file, textequiv_level=textequiv_level, plain_encoding=plain_encoding
).text
print(input_text)

@ -1,16 +1,53 @@
import itertools
import os
from typing import Callable, Iterator, List, Optional, Tuple
import click
from jinja2 import Environment, FileSystemLoader
from ocrd_utils import initLogging
from .align import score_hint
from .character_error_rate import character_error_rate_n
from .cli import gen_diff_report, json_float
from .ocr_files import plain_extract
from .word_error_rate import word_error_rate_n, words_normalized
def removesuffix(text, suffix):
"""
Remove suffix from text.
Can be replaced with str.removesuffix when we only support Python >= 3.9.
"""
if suffix and text.endswith(suffix):
return text[: -len(suffix)]
return text
def is_hidden(filepath):
filename = os.path.basename(os.path.abspath(filepath))
return filename.startswith(".")
def find_all_files(
dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False
) -> Iterator[str]:
"""
Find all files in dir_, returning filenames
If pred is given, pred(filename) must be True for the filename.
Does not return hidden files by default.
"""
for root, _, filenames in os.walk(dir_):
for fn in filenames:
if not return_hidden and is_hidden(fn):
continue
if pred and not pred(fn):
continue
yield os.path.join(root, fn)
def all_equal(iterable):
g = itertools.groupby(iterable)
return next(g, True) and not next(g, False)
@ -24,15 +61,63 @@ def common_suffix(its):
return reversed(common_prefix(reversed(it) for it in its))
def removesuffix(text, suffix):
if suffix and text.endswith(suffix):
return text[: -len(suffix)]
return text
def find_gt_and_ocr_files(
gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str
) -> Iterator[Tuple[str, str]]:
"""
Find GT files and matching OCR files.
Returns pairs of GT and OCR files.
"""
for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
ocr_fn = os.path.join(
ocr_dir,
removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix,
)
if not os.path.exists(ocr_fn):
raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
yield gt_fn, ocr_fn
def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
"""
Find GT files and matching OCR files, autodetect suffixes.
This only works if gt_dir (or respectively ocr_dir) only contains GT (OCR)
files with a common suffix. Currently the files must have a suffix, e.g.
".gt.txt" (e.g. ".ocr.txt").
Returns pairs of GT and OCR files.
"""
# Autodetect suffixes
gt_files = find_all_files(gt_dir)
gt_suffix = "".join(common_suffix(gt_files))
if len(gt_suffix) == 0:
raise RuntimeError(
f"Files in GT directory {gt_dir} do not have a common suffix"
)
ocr_files = find_all_files(ocr_dir)
ocr_suffix = "".join(common_suffix(ocr_files))
if len(ocr_suffix) == 0:
raise RuntimeError(
f"Files in OCR directory {ocr_dir} do not have a common suffix"
)
yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
gt_suffix = "".join(common_suffix(os.listdir(gt_dir)))
ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir)))
def process(
gt_dir,
ocr_dir,
report_prefix,
*,
metrics=True,
gt_suffix=None,
ocr_suffix=None,
plain_encoding="autodetect",
):
cer = None
n_characters = None
@ -41,14 +126,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
n_words = None
word_diff_report = ""
for k, gt in enumerate(os.listdir(gt_dir)):
# Find a match by replacing the suffix
ocr = removesuffix(gt, gt_suffix) + ocr_suffix
if gt_suffix is not None and ocr_suffix is not None:
gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
else:
gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
gt_text = plain_extract(
gt_fn, include_filename_in_id=True, encoding=plain_encoding
)
ocr_text = plain_extract(
os.path.join(ocr_dir, ocr), include_filename_in_id=True
ocr_fn, include_filename_in_id=True, encoding=plain_encoding
)
gt_words: List[str] = list(words_normalized(gt_text))
ocr_words: List[str] = list(words_normalized(ocr_text))
# Compute CER
l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
@ -62,7 +153,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
n_characters = n_characters + l_n_characters
# Compute WER
l_wer, l_n_words = word_error_rate_n(gt_text, ocr_text)
l_wer, l_n_words = word_error_rate_n(gt_words, ocr_words)
if wer is None:
wer, n_words = l_wer, l_n_words
else:
@ -72,13 +163,21 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
# Generate diff reports
char_diff_report += gen_diff_report(
gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
)
gt_words = words_normalized(gt_text)
ocr_words = words_normalized(ocr_text)
gt_text,
ocr_text,
css_prefix="l{0}-c".format(k),
joiner="",
none="·",
score_hint=score_hint(l_cer, l_n_characters),
)[0]
word_diff_report += gen_diff_report(
gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none=""
)
gt_words,
ocr_words,
css_prefix="l{0}-w".format(k),
joiner=" ",
none="",
score_hint=score_hint(l_wer, l_n_words),
)[0]
env = Environment(
loader=FileSystemLoader(
@ -112,17 +211,30 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
@click.option(
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
)
def main(gt, ocr, report_prefix, metrics):
@click.option("--gt-suffix", help="Suffix of GT line text files")
@click.option("--ocr-suffix", help="Suffix of OCR line text files")
@click.option(
"--plain-encoding",
default="autodetect",
help='Encoding (e.g. "utf-8") of plain text files',
)
def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
"""
Compare the GT line text directory against the OCR line text directory.
This assumes that the GT line text directory contains textfiles with a common
suffix like ".gt.txt", and the OCR line text directory contains textfiles with
a common suffix like ".some-ocr.txt". The text files also need to be paired,
i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
in the OCT lines directory.
i.e. the GT filename "line001.gt.txt" needs to match a filename
"line001.some-ocr.txt" in the OCR lines directory.
GT and OCR directories may contain line text files in matching subdirectories,
e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt".
GT and OCR directories can also be the same directory, but in this case you need
to give --gt-suffix and --ocr-suffix explicitly.
The GT and OCR directories are usually round truth line texts and the results of
The GT and OCR directories are usually ground truth line texts and the results of
an OCR software, but you may use dinglehopper to compare two OCR results. In
that case, use --no-metrics to disable the then meaningless metrics and also
change the color scheme from green/red to blue.
@ -131,9 +243,19 @@ def main(gt, ocr, report_prefix, metrics):
$REPORT_PREFIX defaults to "report". The reports include the character error
rate (CER) and the word error rate (WER).
It is recommended to specify the encoding of the text files, for example with
--plain-encoding utf-8. If this option is not given, we try to auto-detect it.
"""
initLogging()
process(gt, ocr, report_prefix, metrics=metrics)
process(
gt,
ocr,
report_prefix,
metrics=metrics,
gt_suffix=gt_suffix,
ocr_suffix=ocr_suffix,
plain_encoding=plain_encoding,
)
if __name__ == "__main__":

@ -1,5 +1,6 @@
import json
import os
from typing import Dict
import click
from jinja2 import Environment, FileSystemLoader
@ -13,8 +14,8 @@ def process(reports_folder, occurrences_threshold=1):
wer_list = []
cer_sum = 0
wer_sum = 0
diff_c = {}
diff_w = {}
diff_c: Dict[str, int] = {}
diff_w: Dict[str, int] = {}
for report in os.listdir(reports_folder):
if report.endswith(".json"):
@ -34,10 +35,15 @@ def process(reports_folder, occurrences_threshold=1):
cer_sum += cer
wer_sum += wer
for key, value in report_data["differences"]["character_level"].items():
try:
for key, value in report_data["differences"][
"character_level"
].items():
diff_c[key] = diff_c.get(key, 0) + value
for key, value in report_data["differences"]["word_level"].items():
diff_w[key] = diff_w.get(key, 0) + value
except KeyError:
pass
if len(cer_list) == 0:
click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")

@ -1,6 +1,5 @@
from __future__ import division, print_function
import unicodedata
from typing import List
from multimethod import multimethod
from rapidfuzz.distance import Levenshtein
@ -10,7 +9,18 @@ from .extracted_text import ExtractedText
@multimethod
def distance(s1: str, s2: str):
def distance(seq1: List[str], seq2: List[str]) -> int:
"""Compute the Levenshtein edit distance between two lists of grapheme clusters.
This assumes that the grapheme clusters are already normalized.
Use distance(str, str) instead if you need to compare two Unicode strings.
"""
return Levenshtein.distance(seq1, seq2)
@distance.register
def _(s1: str, s2: str) -> int:
"""Compute the Levenshtein edit distance between two Unicode strings
Note that this is different from levenshtein() as this function knows about Unicode
@ -22,9 +32,9 @@ def distance(s1: str, s2: str):
return Levenshtein.distance(seq1, seq2)
@multimethod
def distance(s1: ExtractedText, s2: ExtractedText):
return distance(s1.text, s2.text)
@distance.register
def _(s1: ExtractedText, s2: ExtractedText) -> int:
return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters)
def editops(word1, word2):

@ -1,14 +1,16 @@
import enum
import functools
import re
import unicodedata
from contextlib import suppress
from itertools import repeat
from typing import Optional
from typing import Any, Dict, List, Optional
import attr
import numpy as np
from lxml import etree as ET
from ocrd_utils import getLogger
from uniseg.graphemecluster import grapheme_clusters
class Normalization(enum.Enum):
@ -120,7 +122,7 @@ class ExtractedText:
segment_id = attr.ib(type=Optional[str])
@segment_id.validator
def check(self, _, value):
def is_valid_segment_id(self, _, value):
if value is None:
return
if not re.match(r"[\w\d_-]+", value):
@ -130,33 +132,85 @@ class ExtractedText:
# a. _text itself
# b. or segments (ExtractedText) and a joiner
segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
segments = attr.ib(type=Optional[List["ExtractedText"]])
joiner = attr.ib(type=Optional[str])
_text = attr.ib(type=Optional[str])
_grapheme_clusters = attr.ib(type=Optional[List[str]])
@segments.validator
def check(self, _, value):
def cant_set_both_segments_and_text(self, _, value):
if value is not None and self._text is not None:
raise ValueError("Can't have both segments and text")
@joiner.validator
def is_valid_joiner(self, _, value):
if self.segments is None:
if value is not None:
raise ValueError("Can't have joiner without segments to join")
if self.segments is not None:
if value not in ("", " ", "\n"):
raise ValueError(f"Unexpected segment joiner value {repr(value)}")
@_text.validator
def check(self, _, value):
if value is not None and self.segments is not None:
def is_valid_text(self, _, value):
if value is None:
return
if self.segments is not None:
raise ValueError("Can't have both segments and text")
if value is not None and unicodedata.normalize("NFC", value) != value:
if unicodedata.normalize("NFC", value) != value:
raise ValueError('String "{}" is not in NFC.'.format(value))
if value is not None and normalize(value, self.normalization) != value:
if normalize(value, self.normalization) != value:
raise ValueError('String "{}" is not normalized.'.format(value))
if self._grapheme_clusters is None:
raise ValueError("Requires both text and grapheme clusters to be set")
@_grapheme_clusters.validator
def are_valid_grapheme_clusters(self, _, value):
if value is not None and self._text is None:
raise ValueError("Requires both text and grapheme clusters to be set")
normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
@property
def text(self):
def text(self) -> str:
if self._text is not None:
return self._text
else:
assert self.joiner is not None and self.segments is not None
return self.joiner.join(s.text for s in self.segments)
@functools.cached_property
def _joiner_grapheme_cluster(self):
"""We need the joiner as a list of 0 or 1 grapheme clusters.
This property is cached.
"""
assert self.joiner is not None
if len(self.joiner) > 0:
joiner_grapheme_cluster = list(grapheme_clusters(self.joiner))
assert len(joiner_grapheme_cluster) == 1 # see joiner's check above
elif len(self.joiner) == 0:
joiner_grapheme_cluster = []
else:
joiner_grapheme_cluster = None
return joiner_grapheme_cluster
@property
def grapheme_clusters(self):
if self._text is not None:
return self._grapheme_clusters
else:
# TODO Test with text extracted at glyph level (joiner == "")
clusters = []
assert self.segments is not None
for seg in self.segments:
clusters += seg.grapheme_clusters + self._joiner_grapheme_cluster
clusters = clusters[:-1]
return clusters
_segment_id_for_pos = None
def segment_id_for_pos(self, pos):
@ -167,6 +221,7 @@ class ExtractedText:
else:
# Recurse
segment_id_for_pos = []
assert self.joiner is not None and self.segments is not None
for s in self.segments:
seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))]
segment_id_for_pos.extend(seg_ids)
@ -180,7 +235,7 @@ class ExtractedText:
return self._segment_id_for_pos[pos]
@classmethod
def from_text_segment(cls, text_segment, nsmap, textequiv_level="region"):
def from_text_segment(cls, text_segment, nsmap, *, textequiv_level="region"):
"""Build an ExtractedText from a PAGE content text element"""
localname_for_textequiv_level = {"region": "TextRegion", "line": "TextLine"}
@ -197,7 +252,8 @@ class ExtractedText:
# FIXME hardcoded SBB normalization
segment_text = normalize_sbb(segment_text)
segment_text = segment_text or ""
return cls(segment_id, None, None, segment_text)
clusters = list(grapheme_clusters(segment_text))
return cls(segment_id, None, None, segment_text, clusters)
else:
# Recurse
sub_localname = children_for_localname[localname]
@ -212,12 +268,15 @@ class ExtractedText:
)
)
joiner = joiner_for_textequiv_level[sub_textequiv_level]
return cls(segment_id, segments, joiner, None)
return cls(segment_id, segments, joiner, None, None)
@classmethod
def from_str(cls, text, normalization=Normalization.NFC_SBB):
normalized_text = normalize(text, normalization)
return cls(None, None, None, normalized_text, normalization=normalization)
clusters = list(grapheme_clusters(normalized_text))
return cls(
None, None, None, normalized_text, clusters, normalization=normalization
)
def invert_dict(d):
@ -225,7 +284,7 @@ def invert_dict(d):
return {v: k for k, v in d.items()}
def get_textequiv_unicode(text_segment, nsmap) -> str:
def get_textequiv_unicode(text_segment: Any, nsmap: Dict[str, str]) -> str:
"""Get the TextEquiv/Unicode text of the given PAGE text element."""
segment_id = text_segment.attrib["id"]
textequivs = text_segment.findall("./page:TextEquiv", namespaces=nsmap)
@ -249,7 +308,7 @@ def get_first_textequiv(textequivs, segment_id):
if np.any(~nan_mask):
if np.any(nan_mask):
log.warning("TextEquiv without index in %s.", segment_id)
index = np.nanargmin(indices)
index = int(np.nanargmin(indices))
else:
# try ordering by conf
confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
@ -258,7 +317,7 @@ def get_first_textequiv(textequivs, segment_id):
"No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
segment_id,
)
index = np.nanargmax(confidences)
index = int(np.nanargmax(confidences))
else:
# fallback to first entry in case of neither index or conf present
log.warning("No index attributes, use first TextEquiv in %s.", segment_id)
@ -266,11 +325,11 @@ def get_first_textequiv(textequivs, segment_id):
return textequivs[index]
def get_attr(te, attr_name) -> float:
def get_attr(te: Any, attr_name: str) -> float:
"""Extract the attribute for the given name.
Note: currently only handles numeric values!
Other or non existend values are encoded as np.nan.
Other or non existent values are encoded as np.nan.
"""
attr_value = te.attrib.get(attr_name)
try:

@ -22,7 +22,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"dinglehopper uses to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz."
"dinglehopper used to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz."
]
},
{
@ -391,7 +391,7 @@
"\\text{CER} = \\frac{i + s + d}{n}\n",
"$$\n",
"\n",
"where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropiate as they *are* clear about this when computing the word error rate.)"
"where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropriate as they *are* clear about this when computing the word error rate.)"
]
},
{
@ -680,7 +680,7 @@
" return cat in unwanted_categories or subcat in unwanted_subcategories\n",
"\n",
" # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n",
" # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctation \"or similar characters.\"\n",
" # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctuation \"or similar characters.\"\n",
" for word in uniseg.wordbreak.words(s):\n",
" if all(unwanted(c) for c in word):\n",
" pass\n",

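The CER formula quoted a few hunks above can be checked numerically: i + s + d is exactly the Levenshtein distance between OCR output and reference, which RapidFuzz (now used instead of the old home-grown implementation) computes directly. A minimal sketch, not part of the notebook; dinglehopper itself counts grapheme clusters rather than code points, which makes no difference for this example:

```python
from rapidfuzz.distance import Levenshtein

reference = "Schlyñ"   # ground truth
compared = "Sohlym"    # OCR result with two character errors

errors = Levenshtein.distance(reference, compared)  # i + s + d = 2
n = len(reference)                                  # 6 characters in the reference

cer = errors / n
print(cer)  # 2 / 6 ≈ 0.33
```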
@ -1,44 +1,56 @@
from __future__ import division, print_function
import os
import sys
from typing import Iterator
from typing import Dict, Iterator, Optional
import chardet
from lxml import etree as ET
from lxml.etree import XMLSyntaxError
from ocrd_utils import getLogger
from uniseg.graphemecluster import grapheme_clusters
from .extracted_text import ExtractedText, normalize_sbb
log = getLogger("processor.OcrdDinglehopperEvaluate")
def alto_namespace(tree: ET.ElementTree) -> str:
def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
"""Return the ALTO namespace used in the given ElementTree.
This relies on the assumption that, in any given ALTO file, the root element has the
local name "alto". We do not check if the files uses any valid ALTO namespace.
local name "alto". We do not check if the file uses any valid ALTO namespace.
"""
root_name = ET.QName(tree.getroot().tag)
if root_name.localname == "alto":
assert isinstance(root_name.namespace, str)
return root_name.namespace
else:
raise ValueError("Not an ALTO tree")
def alto_extract_lines(tree: ET.ElementTree) -> Iterator[ExtractedText]:
nsmap = {"alto": alto_namespace(tree)}
def alto_nsmap(tree: ET._ElementTree) -> Dict[str, str]:
alto_ns = alto_namespace(tree)
if alto_ns is None:
raise ValueError("Could not determine ALTO namespace")
return {"alto": alto_ns}
def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]:
nsmap = alto_nsmap(tree)
for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
line_id = line.attrib.get("ID")
line_text = " ".join(
string.attrib.get("CONTENT")
string.attrib.get("CONTENT", "")
for string in line.iterfind("alto:String", namespaces=nsmap)
)
yield ExtractedText(line_id, None, None, normalize_sbb(line_text))
normalized_text = normalize_sbb(line_text)
clusters = list(grapheme_clusters(normalized_text))
yield ExtractedText(line_id, None, None, normalized_text, clusters)
# FIXME hardcoded SBB normalization
def alto_extract(tree: ET.ElementTree) -> ExtractedText:
def alto_extract(tree: ET._ElementTree) -> ExtractedText:
"""Extract text from the given ALTO ElementTree."""
return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None)
return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None, None)
def alto_text(tree):
@ -87,7 +99,7 @@ def page_extract(tree, *, textequiv_level="region"):
# Filter empty region texts
regions = [r for r in regions if r.text != ""]
return ExtractedText(None, regions, "\n", None)
return ExtractedText(None, regions, "\n", None, None)
def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
@ -97,7 +109,7 @@ def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
if ET.QName(group.tag).localname in ["OrderedGroup", "OrderedGroupIndexed"]:
ro_children = list(group)
ro_children = filter(lambda child: "index" in child.attrib.keys(), ro_children)
ro_children = [child for child in ro_children if "index" in child.attrib.keys()]
ro_children = sorted(ro_children, key=lambda child: int(child.attrib["index"]))
elif ET.QName(group.tag).localname in ["UnorderedGroup", "UnorderedGroupIndexed"]:
ro_children = list(group)
@ -140,33 +152,44 @@ def detect_encoding(filename):
return chardet.detect(open(filename, "rb").read(1024))["encoding"]
def plain_extract(filename, include_filename_in_id=False):
def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"):
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
fileencoding = detect_encoding(filename)
with open(filename, "r", encoding=fileencoding) as f:
def make_segment(no, line):
normalized_text = normalize_sbb(line)
clusters = list(grapheme_clusters(normalized_text))
return ExtractedText(
None,
[
ExtractedText(
id_template.format(filename=os.path.basename(filename), no=no),
None,
None,
normalize_sbb(line),
normalized_text,
clusters,
)
for no, line in enumerate(f.readlines())
],
if encoding == "autodetect":
fileencoding = detect_encoding(filename)
log.warning(
f"Autodetected encoding as '{fileencoding}'"
", it is recommended to specify it explicitly with --plain-encoding"
)
else:
fileencoding = encoding
with open(filename, "r", encoding=fileencoding) as f:
return ExtractedText(
None,
[make_segment(no, line.strip()) for no, line in enumerate(f.readlines())],
"\n",
None,
None,
)
# XXX hardcoded SBB normalization
def plain_text(filename):
return plain_extract(filename).text
def plain_text(filename, encoding="autodetect"):
return plain_extract(filename, encoding=encoding).text
def extract(filename, *, textequiv_level="region"):
def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"):
"""Extract the text from the given file.
Supports PAGE, ALTO and falls back to plain text.
@ -174,7 +197,7 @@ def extract(filename, *, textequiv_level="region"):
try:
tree = ET.parse(filename)
except (XMLSyntaxError, UnicodeDecodeError):
return plain_extract(filename)
return plain_extract(filename, encoding=plain_encoding)
try:
return page_extract(tree, textequiv_level=textequiv_level)
except ValueError:

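A usage sketch for the new encoding parameter (the import path is an assumption): passing an explicit encoding skips the chardet-based guess and the warning, while the default keeps the previous autodetection behaviour.

```python
from dinglehopper.ocr_files import plain_text  # assumed import path

# Explicit encoding: no chardet guess, no warning.
text = plain_text("ocr.txt", encoding="utf-8")

# Default: falls back to detect_encoding() and logs a warning recommending
# that the encoding be given explicitly (--plain-encoding on the command line).
text_guessed = plain_text("ocr.txt")
```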
@ -1,17 +1,13 @@
{
"version": "0.9.1",
"version": "0.11.0",
"git_url": "https://github.com/qurator-spk/dinglehopper",
"dockerhub": "ocrd/dinglehopper",
"tools": {
"ocrd-dinglehopper": {
"executable": "ocrd-dinglehopper",
"input_file_grp_cardinality": 2,
"output_file_grp_cardinality": 1,
"description": "Evaluate OCR text against ground truth with dinglehopper",
"input_file_grp": [
"OCR-D-GT-PAGE",
"OCR-D-OCR"
],
"output_file_grp": [
"OCR-D-OCR-EVAL"
],
"categories": [
"Quality assurance"
],
@ -29,6 +25,11 @@
"enum": ["region", "line"],
"default": "region",
"description": "PAGE XML hierarchy level to extract the text from"
},
"plain_encoding": {
"type": "string",
"default": "autodetect",
"description": "Encoding (e.g. \"utf-8\") of plain text files"
}
}
}

@ -1,63 +1,59 @@
import json
from functools import cached_property
import os
from typing import Optional
import click
from ocrd_models import OcrdFileType
from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
from pkg_resources import resource_string
from ocrd_utils import make_file_id
from .cli import process as cli_process
OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
@click.command()
@ocrd_cli_options
def ocrd_dinglehopper(*args, **kwargs):
return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
class OcrdDinglehopperEvaluate(Processor):
def __init__(self, *args, **kwargs):
kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
def process(self):
assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
assert_file_grp_cardinality(self.output_file_grp, 1)
@cached_property
def executable(self):
return 'ocrd-dinglehopper'
log = getLogger("processor.OcrdDinglehopperEvaluate")
def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
assert self.parameter
metrics = self.parameter["metrics"]
textequiv_level = self.parameter["textequiv_level"]
gt_grp, ocr_grp = self.input_file_grp.split(",")
plain_encoding = self.parameter["plain_encoding"]
input_file_tuples = self.zip_input_files(on_error="abort")
for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
# wrong number of inputs: let fail
gt_file, ocr_file = input_files
# missing on either side: skip (zip_input_files already warned)
if not gt_file or not ocr_file:
# file/page was not found in this group
continue
gt_file = self.workspace.download_file(gt_file)
ocr_file = self.workspace.download_file(ocr_file)
page_id = gt_file.pageId
return
# missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
if not gt_file.local_filename:
if config.OCRD_MISSING_INPUT == 'ABORT':
raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype)
return
if not ocr_file.local_filename:
if config.OCRD_MISSING_INPUT == 'ABORT':
raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype)
return
log.info("INPUT FILES %i / %s%s", n, gt_file, ocr_file)
page_id = gt_file.pageId
file_id = make_file_id(ocr_file, self.output_file_grp)
report_prefix = os.path.join(self.output_file_grp, file_id)
# Process the files
try:
os.mkdir(self.output_file_grp)
except FileExistsError:
pass
cli_process(
gt_file.local_filename,
ocr_file.local_filename,
report_prefix,
file_id,
self.output_file_grp,
metrics=metrics,
textequiv_level=textequiv_level,
plain_encoding=plain_encoding,
)
# Add reports to the workspace
@ -65,12 +61,16 @@ class OcrdDinglehopperEvaluate(Processor):
[".html", "text/html"],
[".json", "application/json"],
]:
output_file_id = file_id + report_suffix
output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set")
self.workspace.add_file(
file_id=file_id + report_suffix,
file_id=output_file_id,
file_grp=self.output_file_grp,
page_id=page_id,
mimetype=mimetype,
local_filename=report_prefix + report_suffix,
local_filename=file_id + report_suffix,
)

@ -138,17 +138,17 @@
<mets:fileSec>
<mets:fileGrp USE="OCR-D-GT-PAGE">
<mets:file MIMETYPE="application/xml" ID="OCR-D-GT-PAGE_00000024">
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-GT-PAGE/00000024.page.xml"/>
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-GT-PAGE/00000024.page.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
<mets:fileGrp USE="OCR-D-OCR-CALAMARI">
<mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-CALAMARI_0001">
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml"/>
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
<mets:fileGrp USE="OCR-D-OCR-TESS">
<mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-TESS_0001">
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml"/>
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
</mets:fileSec>

@ -13,12 +13,13 @@ def test_text():
test1 = ExtractedText(
None,
[
ExtractedText("s0", None, None, "foo"),
ExtractedText("s1", None, None, "bar"),
ExtractedText("s2", None, None, "bazinga"),
ExtractedText("s0", None, None, "foo", grapheme_clusters("foo")),
ExtractedText("s1", None, None, "bar", grapheme_clusters("bar")),
ExtractedText("s2", None, None, "bazinga", grapheme_clusters("bazinga")),
],
" ",
None,
None,
)
assert test1.text == "foo bar bazinga"
@ -29,8 +30,20 @@ def test_text():
def test_normalization_check():
with pytest.raises(ValueError, match=r".*is not in NFC.*"):
ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ"))
assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ"))
ExtractedText(
"foo",
None,
None,
unicodedata.normalize("NFD", "Schlyñ"),
grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")),
)
assert ExtractedText(
"foo",
None,
None,
unicodedata.normalize("NFC", "Schlyñ"),
grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")),
)
AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
@ -47,25 +60,27 @@ def test_align():
test1 = ExtractedText(
None,
[
ExtractedText("s0", None, None, "foo"),
ExtractedText("s1", None, None, "bar"),
ExtractedText("s2", None, None, "batzinga"),
ExtractedText("s0", None, None, "foo", grapheme_clusters("foo")),
ExtractedText("s1", None, None, "bar", grapheme_clusters("bar")),
ExtractedText("s2", None, None, "batzinga", grapheme_clusters("batzinga")),
],
" ",
None,
None,
)
test2 = ExtractedText(
None,
[
ExtractedText("x0", None, None, "foo"),
ExtractedText("x1", None, None, "bar"),
ExtractedText("x0", None, None, "foo", grapheme_clusters("foo")),
ExtractedText("x1", None, None, "bar", grapheme_clusters("bar")),
# extra .
ExtractedText("x2", None, None, "."),
ExtractedText("x2", None, None, ".", grapheme_clusters(".")),
# deletion + different grapheme cluster, m̃ also is two Python characters
ExtractedText("x3", None, None, "bazim̃ga"),
ExtractedText("x3", None, None, "bazim̃ga", grapheme_clusters("bazim̃ga")),
],
" ",
None,
None,
)
left_pos = 0

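The reason the constructor now takes a precomputed grapheme-cluster list is that error rates are counted over user-perceived characters, not Python code points. A small illustration (not part of the tests) using the string from the comment above:

```python
from uniseg.graphemecluster import grapheme_clusters

s = "bazim̃ga"  # m̃ is 'm' + U+0303 COMBINING TILDE, i.e. two code points
print(len(s))                           # 8 code points
print(len(list(grapheme_clusters(s))))  # 7 user-perceived characters
```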
@ -1,6 +1,8 @@
import math
import pytest
from .. import align, distance, seq_align
from .. import align, distance, score_hint, seq_align
from .util import unzip
@ -183,3 +185,8 @@ def test_lines_similar():
# Test __eq__ (i.e. is it a substitution or a similar string?)
assert list(left)[0] == list(right)[0]
def test_score_hint():
assert score_hint(0.5, 23) == 12 # int(ceil())
assert score_hint(math.inf, 12345) is None

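From these two assertions, score_hint apparently turns an expected error rate and a sequence length into an integer cutoff (presumably fed to RapidFuzz's score hint/cutoff parameters), with an infinite rate meaning "no usable hint". A sketch that is consistent with the test, not necessarily the actual implementation:

```python
import math
from typing import Optional

def score_hint(expected_score: float, n: int) -> Optional[int]:
    """Round the expected number of edits (rate * length) up to an int;
    an infinite rate yields no hint at all."""
    if math.isinf(expected_score):
        return None
    return int(math.ceil(expected_score * n))

assert score_hint(0.5, 23) == 12
assert score_hint(math.inf, 12345) is None
```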
@ -21,9 +21,9 @@ def test_cli_directory(tmp_path):
os.path.join(data_dir, "directory-test", "ocr"),
"report",
str(tmp_path / "reports"),
False,
True,
"line",
metrics=False,
differences=True,
textequiv_level="line",
)
assert os.path.exists(tmp_path / "reports/1.xml-report.json")
@ -45,9 +45,9 @@ def test_cli_fail_without_gt(tmp_path):
os.path.join(data_dir, "directory-test", "ocr"),
"report",
str(tmp_path / "reports"),
False,
True,
"line",
metrics=False,
differences=True,
textequiv_level="line",
)
assert len(os.listdir(tmp_path / "reports")) == 2 * 2

@ -0,0 +1,61 @@
import json
import os.path
import re
import pytest
from ..cli_line_dirs import process
from .util import working_directory
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
def test_cli_line_dirs_basic(tmp_path):
"""Test that the cli/process() produces a good report"""
with working_directory(tmp_path):
gt_dir = os.path.join(data_dir, "line_dirs/basic/gt")
ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr")
process(gt_dir, ocr_dir, "report")
with open("report.json", "r") as jsonf:
print(jsonf.read())
with open("report.json", "r") as jsonf:
j = json.load(jsonf)
assert j["cer"] == pytest.approx(0.1071429)
assert j["wer"] == pytest.approx(0.5)
@pytest.mark.integration
def test_cli_line_dirs_basic_report_diff(tmp_path):
"""Test that the cli/process() produces a report wiff char+word diff"""
with working_directory(tmp_path):
gt_dir = os.path.join(data_dir, "line_dirs/basic/gt")
ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr")
process(gt_dir, ocr_dir, "report")
with open("report.html", "r") as htmlf:
html_report = htmlf.read()
# Counting GT lines in the diff
assert len(re.findall(r"gt.*l\d+-cdiff", html_report)) == 2
assert len(re.findall(r"gt.*l\d+-wdiff", html_report)) == 2
@pytest.mark.integration
def test_cli_line_dirs_merged(tmp_path):
"""Test that the cli/process() produces a good report"""
with working_directory(tmp_path):
gt_dir = os.path.join(data_dir, "line_dirs/merged")
ocr_dir = os.path.join(data_dir, "line_dirs/merged")
process(
gt_dir, ocr_dir, "report", gt_suffix=".gt.txt", ocr_suffix=".some-ocr.txt"
)
with open("report.json", "r") as jsonf:
print(jsonf.read())
with open("report.json", "r") as jsonf:
j = json.load(jsonf)
assert j["cer"] == pytest.approx(0.1071429)
assert j["wer"] == pytest.approx(0.5)

@ -1,4 +1,5 @@
import json
import re
import pytest
@ -40,3 +41,25 @@ def test_cli_json_cer_is_infinity(tmp_path):
with open("report.json", "r") as jsonf:
j = json.load(jsonf)
assert j["cer"] == pytest.approx(float("inf"))
@pytest.mark.integration
def test_cli_html(tmp_path):
"""Test that the cli/process() yields complete HTML report"""
with working_directory(tmp_path):
with open("gt.txt", "w") as gtf:
gtf.write("AAAAA")
with open("ocr.txt", "w") as ocrf:
ocrf.write("AAAAB")
process("gt.txt", "ocr.txt", "report")
with open("report.html", "r") as htmlf:
html_report = htmlf.read()
print(html_report)
assert re.search(r"CER: 0\.\d+", html_report)
assert re.search(r"WER: 1\.0", html_report)
assert len(re.findall("gt.*cdiff", html_report)) == 1
assert len(re.findall("gt.*wdiff", html_report)) == 1

@ -0,0 +1,35 @@
from __future__ import division, print_function
import math
import pytest
from .. import character_error_rate, plain_text
from .util import working_directory
@pytest.mark.integration
@pytest.mark.parametrize(
"gt_file_content,ocr_file_content,cer_expected",
[
("", "Lorem ipsum", math.inf),
("Lorem ipsum", "", 1.0),
("\ufeff", "Lorem ipsum", math.inf),
("Lorem ipsum", "\ufeff", 1.0),
("", "", 0.0),
("\ufeff", "", 0.0),
("", "\ufeff", 0.0),
],
)
def test_empty_files(tmp_path, gt_file_content, ocr_file_content, cer_expected):
with working_directory(tmp_path):
with open("gt.txt", "w") as gtf:
gtf.write(gt_file_content)
with open("ocr.txt", "w") as ocrf:
ocrf.write(ocr_file_content)
gt_text = plain_text("gt.txt")
ocr_text = plain_text("ocr.txt")
assert character_error_rate(gt_text, ocr_text) == cer_expected

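The parametrized cases encode the edge-case semantics of the CER: with an empty (or BOM-only) reference every OCR character is an insertion and n = 0, so the rate is taken to be infinite; with empty OCR output every reference character is a deletion, giving exactly 1.0. A small sketch of that convention (not dinglehopper's implementation):

```python
import math

def cer(errors: int, n: int) -> float:
    """errors = i + s + d, n = number of characters in the reference."""
    if n > 0:
        return errors / n
    return 0.0 if errors == 0 else math.inf

assert cer(len("Lorem ipsum"), 0) == math.inf             # "" vs "Lorem ipsum"
assert cer(len("Lorem ipsum"), len("Lorem ipsum")) == 1.0  # "Lorem ipsum" vs ""
assert cer(0, 0) == 0.0                                    # both empty
```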
@ -34,9 +34,8 @@ def test_ocrd_cli(tmp_path):
"-O",
"OCR-D-OCR-CALAMARI-EVAL",
]
sys.argv[
1:
] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
# Hack to satisfy ocrd_cli_wrap_processor() check for arguments
sys.argv[1:] = args
result = runner.invoke(ocrd_dinglehopper, args)
assert result.exit_code == 0
result_json = list((test_workspace_dir / "OCR-D-OCR-CALAMARI-EVAL").glob("*.json"))

@ -0,0 +1,71 @@
import os
from ..cli_line_dirs import find_gt_and_ocr_files, find_gt_and_ocr_files_autodetect
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
def test_basic():
"""Test the dumb method: User gives directories and suffixes."""
pairs = list(
find_gt_and_ocr_files(
os.path.join(data_dir, "line_dirs/basic/gt"),
".gt.txt",
os.path.join(data_dir, "line_dirs/basic/ocr"),
".some-ocr.txt",
)
)
assert len(pairs) == 2
def test_basic_autodetect():
"""Test autodetect: User gives directories, suffixes are autodetected if possible"""
pairs = list(
find_gt_and_ocr_files_autodetect(
os.path.join(data_dir, "line_dirs/basic/gt"),
os.path.join(data_dir, "line_dirs/basic/ocr"),
)
)
assert len(pairs) == 2
def test_subdirs():
"""Test the dumb method: Should also work when subdirectories are involved."""
pairs = list(
find_gt_and_ocr_files(
os.path.join(data_dir, "line_dirs/subdirs/gt"),
".gt.txt",
os.path.join(data_dir, "line_dirs/subdirs/ocr"),
".some-ocr.txt",
)
)
assert len(pairs) == 2
def test_subdirs_autodetect():
"""Test the autodetect method: Should also work when subdirectories are involved."""
pairs = list(
find_gt_and_ocr_files_autodetect(
os.path.join(data_dir, "line_dirs/subdirs/gt"),
os.path.join(data_dir, "line_dirs/subdirs/ocr"),
)
)
assert len(pairs) == 2
def test_merged():
"""Test the dumb method: GT and OCR texts are in the same directories."""
pairs = list(
find_gt_and_ocr_files(
os.path.join(data_dir, "line_dirs/merged"),
".gt.txt",
os.path.join(data_dir, "line_dirs/merged"),
".some-ocr.txt",
)
)
assert len(pairs) == 2

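A usage sketch of the "merged" case (directory name, file stems and the import path are hypothetical; the tests above only check that two pairs are found):

```python
import os
from dinglehopper.cli_line_dirs import find_gt_and_ocr_files  # assumed import path

# Merged layout: GT and OCR line texts live in the same directory and are
# distinguished only by their suffixes.
os.makedirs("merged", exist_ok=True)
for stem in ("line001", "line002"):
    with open(os.path.join("merged", stem + ".gt.txt"), "w") as f:
        f.write("ground truth line\n")
    with open(os.path.join("merged", stem + ".some-ocr.txt"), "w") as f:
        f.write("ocr line\n")

pairs = list(find_gt_and_ocr_files("merged", ".gt.txt", "merged", ".some-ocr.txt"))
print(len(pairs))  # 2
```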
@ -177,8 +177,20 @@ def test_text():
def test_plain(tmp_path):
with working_directory(tmp_path):
with open("ocr.txt", "w") as ocrf:
ocrf.write("AAAAB")
ocrf.write("First, a line.\nAnd a second line.\n")
result = plain_text("ocr.txt")
expected = "AAAAB"
expected = "First, a line.\nAnd a second line."
assert result == expected
def test_plain_BOM(tmp_path):
"""Test that plain text files with BOM are read correctly."""
BOM = "\ufeff"
with working_directory(tmp_path):
with open("ocr.txt", "w") as ocrf:
ocrf.write(BOM + "First, a line.\nAnd a second line.\n")
result = plain_text("ocr.txt")
expected = "First, a line.\nAnd a second line."
assert result == expected

@ -1,7 +1,5 @@
from __future__ import division
import unicodedata
from typing import Iterable, Tuple
from typing import Generator, Iterable, Tuple, TypeVar
import uniseg.wordbreak
from multimethod import multimethod
@ -9,6 +7,8 @@ from rapidfuzz.distance import Levenshtein
from .extracted_text import ExtractedText
T = TypeVar("T")
# Did we patch uniseg.wordbreak.word_break already?
word_break_patched = False
@ -21,12 +21,17 @@ def patch_word_break():
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
"""
old_word_break = uniseg.wordbreak.word_break
if hasattr(uniseg.wordbreak, 'Word_Break'):
aletter = uniseg.wordbreak.Word_Break.ALetter
else:
# uniseg<0.9
aletter = uniseg.wordbreak.WordBreak.ALETTER
def new_word_break(c, index=0):
def new_word_break(c):
if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area
return "ALetter"
return aletter
else:
return old_word_break(c, index)
return old_word_break(c)
uniseg.wordbreak.word_break = new_word_break
global word_break_patched
@ -34,7 +39,7 @@ def patch_word_break():
@multimethod
def words(s: str):
def words(s: str) -> Generator[str, None, None]:
"""Extract words from a string"""
global word_break_patched
@ -54,7 +59,7 @@ def words(s: str):
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on
# word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain
# only whitespace, punctation "or similar characters."
# only whitespace, punctuation "or similar characters."
for word in uniseg.wordbreak.words(s):
if all(unwanted(c) for c in word):
pass
@ -62,37 +67,37 @@ def words(s: str):
yield word
@multimethod
def words(s: ExtractedText):
return words(s.text)
@words.register
def _(s: ExtractedText) -> Generator[str, None, None]:
yield from words(s.text)
@multimethod
def words_normalized(s: str):
return words(unicodedata.normalize("NFC", s))
def words_normalized(s: str) -> Generator[str, None, None]:
yield from words(unicodedata.normalize("NFC", s))
@multimethod
def words_normalized(s: ExtractedText):
return words_normalized(s.text)
@words_normalized.register
def _(s: ExtractedText) -> Generator[str, None, None]:
yield from words_normalized(s.text)
@multimethod
def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
reference_seq = list(words_normalized(reference))
compared_seq = list(words_normalized(compared))
return word_error_rate_n(reference_seq, compared_seq)
wer, n = word_error_rate_n(reference_seq, compared_seq)
return wer, n
@multimethod
def word_error_rate_n(
reference: ExtractedText, compared: ExtractedText
) -> Tuple[float, int]:
return word_error_rate_n(reference.text, compared.text)
@word_error_rate_n.register
def _(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
wer, n = word_error_rate_n(reference.text, compared.text)
return wer, n
@multimethod
def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
@word_error_rate_n.register
def _(reference: Iterable[T], compared: Iterable[T]) -> Tuple[float, int]:
reference_seq = list(reference)
compared_seq = list(compared)
@ -106,6 +111,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, i
return d / n, n
def word_error_rate(reference, compared) -> float:
def word_error_rate(reference: T, compared: T) -> float:
wer: float
wer, _ = word_error_rate_n(reference, compared)
return wer

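The word segmentation used throughout follows UAX #29 via uniseg, dropping "words" that consist only of whitespace, punctuation or similar. A self-contained sketch of that filtering (the category set here is illustrative, not the exact one dinglehopper uses):

```python
import unicodedata
import uniseg.wordbreak

def words(s):
    """Split on Unicode word boundaries, drop whitespace/punctuation-only 'words'."""
    unwanted_categories = {"Zs", "Cc", "Po", "Pd", "Pc", "Ps", "Pe", "Pi", "Pf"}
    for word in uniseg.wordbreak.words(s):
        if not all(unicodedata.category(c) in unwanted_categories for c in word):
            yield word

print(list(words("Dies ist ein Beispielsatz, mit Satzzeichen!")))
# ['Dies', 'ist', 'ein', 'Beispielsatz', 'mit', 'Satzzeichen']
```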