From f3825cdeb67236e39bbef271d01970f5fcf254a2 Mon Sep 17 00:00:00 2001
From: Max Bachmann <kontakt@maxbachmann.de>
Date: Mon, 29 Aug 2022 00:22:23 +0200
Subject: [PATCH 001/176] only call `words_normalized` once

---
 qurator/dinglehopper/cli.py           | 7 +++----
 qurator/dinglehopper/cli_line_dirs.py | 6 +++---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 72d428d..be6f020 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -106,16 +106,15 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
 
     gt_text = extract(gt, textequiv_level=textequiv_level)
     ocr_text = extract(ocr, textequiv_level=textequiv_level)
+    gt_words = words_normalized(gt_text)
+    ocr_words = words_normalized(ocr_text)
 
     cer, n_characters = character_error_rate_n(gt_text, ocr_text)
-    wer, n_words = word_error_rate_n(gt_text, ocr_text)
-
     char_diff_report = gen_diff_report(
         gt_text, ocr_text, css_prefix="c", joiner="", none="·"
     )
 
-    gt_words = words_normalized(gt_text)
-    ocr_words = words_normalized(ocr_text)
+    wer, n_words = word_error_rate_n(gt_words, ocr_words)
     word_diff_report = gen_diff_report(
         gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
     )
diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py
index 4c07ce5..06d12b8 100644
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -53,6 +53,8 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
 
         gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
         ocr_text = plain_extract(os.path.join(ocr_dir, ocr), include_filename_in_id=True)
+        gt_words = words_normalized(gt_text)
+        ocr_words = words_normalized(ocr_text)
 
         # Compute CER
         l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
@@ -64,7 +66,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
             n_characters = n_characters + l_n_characters
 
         # Compute WER
-        l_wer, l_n_words = word_error_rate_n(gt_text, ocr_text)
+        l_wer, l_n_words = word_error_rate_n(gt_words, ocr_words)
         if wer is None:
             wer, n_words = l_wer, l_n_words
         else:
@@ -76,8 +78,6 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
         char_diff_report += gen_diff_report(
             gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
         )
-        gt_words = words_normalized(gt_text)
-        ocr_words = words_normalized(ocr_text)
         word_diff_report += gen_diff_report(
             gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯"
         )

From 205a969c0e1fc0ce8b03cf3374edab21e9209385 Mon Sep 17 00:00:00 2001
From: Max Bachmann <kontakt@maxbachmann.de>
Date: Mon, 29 Aug 2022 00:48:40 +0200
Subject: [PATCH 002/176] remove unused imports

---
 qurator/dinglehopper/cli_extract.py   | 3 ---
 qurator/dinglehopper/cli_line_dirs.py | 6 ------
 qurator/dinglehopper/edit_distance.py | 5 -----
 qurator/dinglehopper/ocr_files.py     | 1 -
 4 files changed, 15 deletions(-)

diff --git a/qurator/dinglehopper/cli_extract.py b/qurator/dinglehopper/cli_extract.py
index 0d4f713..9c51d34 100644
--- a/qurator/dinglehopper/cli_extract.py
+++ b/qurator/dinglehopper/cli_extract.py
@@ -1,9 +1,6 @@
-import os
-
 import click
 from ocrd_utils import initLogging
 
-from .extracted_text import ExtractedText
 from .ocr_files import extract
 
 
diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py
index 06d12b8..59c4a1f 100644
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -1,19 +1,13 @@
 import os
-import sys
 import itertools
 
 import click
 from jinja2 import Environment, FileSystemLoader
-from markupsafe import escape
-from uniseg.graphemecluster import grapheme_clusters
 from ocrd_utils import initLogging
 
 from .character_error_rate import character_error_rate_n
 from .word_error_rate import word_error_rate_n, words_normalized
-from .align import seq_align
-from .extracted_text import ExtractedText
 from .ocr_files import plain_extract
-from .config import Config
 from .cli import gen_diff_report, json_float
 
 
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index 24f6928..531beeb 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -1,17 +1,12 @@
 from __future__ import division, print_function
 
 import unicodedata
-from functools import partial, lru_cache
-from typing import Sequence, Tuple
 
-import numpy as np
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
-from tqdm import tqdm
 from rapidfuzz.distance import Levenshtein
 
 from .extracted_text import ExtractedText
-from .config import Config
 
 
 @multimethod
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 69f4df7..94c34d7 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -3,7 +3,6 @@ from __future__ import division, print_function
 import os
 import sys
 from typing import Iterator
-from warnings import warn
 
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError

From f211d09f5627100e4299dd47d56f174d53da64cb Mon Sep 17 00:00:00 2001
From: Max Bachmann <kontakt@maxbachmann.de>
Date: Mon, 29 Aug 2022 00:50:33 +0200
Subject: [PATCH 003/176] remove Python 2.7 __future__ imports

---
 qurator/dinglehopper/character_error_rate.py | 2 --
 qurator/dinglehopper/edit_distance.py        | 2 --
 qurator/dinglehopper/ocr_files.py            | 2 --
 qurator/dinglehopper/word_error_rate.py      | 2 --
 4 files changed, 8 deletions(-)

diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py
index 0c3ef7d..2128a9f 100644
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -1,5 +1,3 @@
-from __future__ import division
-
 import unicodedata
 from typing import Tuple
 
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index 531beeb..3adb059 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -1,5 +1,3 @@
-from __future__ import division, print_function
-
 import unicodedata
 
 from multimethod import multimethod
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 94c34d7..92f4fe5 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -1,5 +1,3 @@
-from __future__ import division, print_function
-
 import os
 import sys
 from typing import Iterator
diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py
index 8f0cc96..0976921 100644
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -1,5 +1,3 @@
-from __future__ import division
-
 import unicodedata
 from typing import Tuple, Iterable
 from multimethod import multimethod

From 01571f23b74ca4e13e0934b8ac0cf0100956e31c Mon Sep 17 00:00:00 2001
From: Max Bachmann <kontakt@maxbachmann.de>
Date: Mon, 29 Aug 2022 01:49:04 +0200
Subject: [PATCH 004/176] move grapheme clusters to ExtractedText

---
 qurator/dinglehopper/character_error_rate.py | 13 +++++--
 qurator/dinglehopper/cli.py                  |  6 ++--
 qurator/dinglehopper/edit_distance.py        | 12 ++++++-
 qurator/dinglehopper/extracted_text.py       | 37 ++++++++++++++++----
 qurator/dinglehopper/ocr_files.py            | 25 ++++++++-----
 5 files changed, 70 insertions(+), 23 deletions(-)

diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py
index 2128a9f..7116660 100644
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -9,7 +9,7 @@ from .extracted_text import ExtractedText
 
 
 @multimethod
-def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
+def character_error_rate_n(reference: list[str], compared: list[str]) -> Tuple[float, int]:
     """
     Compute character error rate.
 
@@ -17,7 +17,7 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
     """
 
     d = distance(reference, compared)
-    n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))
+    n = len(reference)
 
     if d == 0:
         return 0, n
@@ -28,11 +28,18 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
     # XXX Should we really count newlines here?
 
 
+@multimethod
+def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
+    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", reference)))
+    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", compared)))
+    return character_error_rate_n(seq1, seq2)
+
+
 @multimethod
 def character_error_rate_n(
     reference: ExtractedText, compared: ExtractedText
 ) -> Tuple[float, int]:
-    return character_error_rate_n(reference.text, compared.text)
+    return character_error_rate_n(reference.grapheme_clusters, compared.grapheme_clusters)
 
 
 def character_error_rate(reference, compared) -> float:
diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index be6f020..3c52c5d 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -3,7 +3,6 @@ import os
 import click
 from jinja2 import Environment, FileSystemLoader
 from markupsafe import escape
-from uniseg.graphemecluster import grapheme_clusters
 from ocrd_utils import initLogging
 
 from .character_error_rate import character_error_rate_n
@@ -45,9 +44,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
     if isinstance(gt_in, ExtractedText):
         if not isinstance(ocr_in, ExtractedText):
             raise TypeError()
-        # XXX splitting should be done in ExtractedText
-        gt_things = list(grapheme_clusters(gt_in.text))
-        ocr_things = list(grapheme_clusters(ocr_in.text))
+        gt_things = gt_in.grapheme_clusters
+        ocr_things = ocr_in.grapheme_clusters
     else:
         gt_things = gt_in
         ocr_things = ocr_in
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index 3adb059..ad8eaf2 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -7,6 +7,16 @@ from rapidfuzz.distance import Levenshtein
 from .extracted_text import ExtractedText
 
 
+@multimethod
+def distance(seq1: list[str], seq2: list[str]):
+    """Compute the Levenshtein edit distance between two Unicode strings
+
+    Note that this is different from levenshtein() as this function knows about Unicode
+    normalization and grapheme clusters. This should be the correct way to compare two
+    Unicode strings.
+    """
+    return Levenshtein.distance(seq1, seq2)
+
 @multimethod
 def distance(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings
@@ -22,7 +32,7 @@ def distance(s1: str, s2: str):
 
 @multimethod
 def distance(s1: ExtractedText, s2: ExtractedText):
-    return distance(s1.text, s2.text)
+    return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters)
 
 
 def editops(word1, word2):
diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py
index 9703b6b..0ddebf5 100644
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -9,6 +9,7 @@ import attr
 import numpy as np
 from lxml import etree as ET
 from ocrd_utils import getLogger
+from uniseg.graphemecluster import grapheme_clusters
 
 
 class Normalization(enum.Enum):
@@ -133,6 +134,7 @@ class ExtractedText:
     segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
     joiner = attr.ib(type=Optional[str])
     _text = attr.ib(type=Optional[str])
+    _grapheme_clusters = attr.ib(type=Optional[list[str]])
 
     @segments.validator
     def check(self, _, value):
@@ -141,12 +143,22 @@ class ExtractedText:
 
     @_text.validator
     def check(self, _, value):
-        if value is not None and self.segments is not None:
+        if value is None:
+            return
+
+        if self.segments is not None:
             raise ValueError("Can't have both segments and text")
-        if value is not None and unicodedata.normalize("NFC", value) != value:
+        if unicodedata.normalize("NFC", value) != value:
             raise ValueError('String "{}" is not in NFC.'.format(value))
-        if value is not None and normalize(value, self.normalization) != value:
+        if normalize(value, self.normalization) != value:
             raise ValueError('String "{}" is not normalized.'.format(value))
+        if self._grapheme_clusters is None:
+            raise ValueError("Requires both text and grapheme clusters to be set")
+
+    @_grapheme_clusters.validator
+    def check(self, _, value):
+        if value is not None and self._text is None:
+            raise ValueError("Requires both text and grapheme clusters to be set")
 
     normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
 
@@ -157,6 +169,17 @@ class ExtractedText:
         else:
             return self.joiner.join(s.text for s in self.segments)
 
+    @property
+    def grapheme_clusters(self):
+        if self._text is not None:
+            return self._grapheme_clusters
+        else:
+            clusters = []
+            for seg in  self.segments:
+                # todo could there be cases where joiner is no grapheme cluster?
+                clusters.extend(seg.grapheme_clusters + [self.joiner])
+            return clusters[:-1]
+
     _segment_id_for_pos = None
 
     def segment_id_for_pos(self, pos):
@@ -197,7 +220,8 @@ class ExtractedText:
                 # FIXME hardcoded SBB normalization
                 segment_text = normalize_sbb(segment_text)
             segment_text = segment_text or ""
-            return cls(segment_id, None, None, segment_text)
+            clusters = list(grapheme_clusters(segment_text))
+            return cls(segment_id, None, None, segment_text, clusters)
         else:
             # Recurse
             sub_localname = children_for_localname[localname]
@@ -212,12 +236,13 @@ class ExtractedText:
                     )
                 )
             joiner = joiner_for_textequiv_level[sub_textequiv_level]
-            return cls(segment_id, segments, joiner, None)
+            return cls(segment_id, segments, joiner, None, None)
 
     @classmethod
     def from_str(cls, text, normalization=Normalization.NFC_SBB):
         normalized_text = normalize(text, normalization)
-        return cls(None, None, None, normalized_text, normalization=normalization)
+        clusters = list(grapheme_clusters(normalized_text))
+        return cls(None, None, None, normalized_text, clusters, normalization=normalization)
 
 
 def invert_dict(d):
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 92f4fe5..38190da 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -4,6 +4,7 @@ from typing import Iterator
 
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
+from uniseg.graphemecluster import grapheme_clusters
 
 from .extracted_text import ExtractedText, normalize_sbb
 
@@ -29,13 +30,15 @@ def alto_extract_lines(tree: ET.ElementTree) -> Iterator[ExtractedText]:
             string.attrib.get("CONTENT")
             for string in line.iterfind("alto:String", namespaces=nsmap)
         )
-        yield ExtractedText(line_id, None, None, normalize_sbb(line_text))
+        normalized_text = normalize_sbb(line_text)
+        clusters = list(grapheme_clusters(normalized_text))
+        yield ExtractedText(line_id, None, None, normalized_text, clusters)
         # FIXME hardcoded SBB normalization
 
 
 def alto_extract(tree: ET.ElementTree) -> ExtractedText:
     """Extract text from the given ALTO ElementTree."""
-    return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None)
+    return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None, None)
 
 
 def alto_text(tree):
@@ -83,7 +86,7 @@ def page_extract(tree, *, textequiv_level="region"):
     # Filter empty region texts
     regions = [r for r in regions if r.text != ""]
 
-    return ExtractedText(None, regions, "\n", None)
+    return ExtractedText(None, regions, "\n", None, None)
 
 
 def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
@@ -130,17 +133,21 @@ def page_text(tree, *, textequiv_level="region"):
 
 def plain_extract(filename, include_filename_in_id=False):
     id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
+
+    def make_segment(no, line):
+        normalized_text = normalize_sbb(line)
+        clusters = list(grapheme_clusters(normalized_text))
+        return ExtractedText(
+            id_template.format(filename=os.path.basename(filename), no=no),
+            None, None, normalized_text, clusters)
+
     with open(filename, "r") as f:
         return ExtractedText(
             None,
-            [
-                ExtractedText(
-                    id_template.format(filename=os.path.basename(filename), no=no),
-                    None, None, normalize_sbb(line))
-                for no, line in enumerate(f.readlines())
-            ],
+            [make_segment(no, line) for no, line in enumerate(f.readlines())],
             "\n",
             None,
+            None
         )
     # XXX hardcoded SBB normalization
 

From 22c3817f453e4a0641681863bc7704f7a679601a Mon Sep 17 00:00:00 2001
From: Max Bachmann <kontakt@maxbachmann.de>
Date: Mon, 29 Aug 2022 01:50:19 +0200
Subject: [PATCH 005/176] apply black

---
 qurator/dinglehopper/character_error_rate.py |  8 ++++++--
 qurator/dinglehopper/cli_line_dirs.py        | 10 +++++++---
 qurator/dinglehopper/edit_distance.py        |  1 +
 qurator/dinglehopper/extracted_text.py       |  6 ++++--
 qurator/dinglehopper/ocr_files.py            | 18 +++++++++++++-----
 qurator/dinglehopper/ocrd_cli.py             |  2 +-
 qurator/dinglehopper/word_error_rate.py      |  1 -
 7 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py
index 7116660..3b8c0cc 100644
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -9,7 +9,9 @@ from .extracted_text import ExtractedText
 
 
 @multimethod
-def character_error_rate_n(reference: list[str], compared: list[str]) -> Tuple[float, int]:
+def character_error_rate_n(
+    reference: list[str], compared: list[str]
+) -> Tuple[float, int]:
     """
     Compute character error rate.
 
@@ -39,7 +41,9 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
 def character_error_rate_n(
     reference: ExtractedText, compared: ExtractedText
 ) -> Tuple[float, int]:
-    return character_error_rate_n(reference.grapheme_clusters, compared.grapheme_clusters)
+    return character_error_rate_n(
+        reference.grapheme_clusters, compared.grapheme_clusters
+    )
 
 
 def character_error_rate(reference, compared) -> float:
diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py
index 59c4a1f..3f8e3fc 100644
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -26,7 +26,7 @@ def common_suffix(its):
 
 def removesuffix(text, suffix):
     if suffix and text.endswith(suffix):
-        return text[:-len(suffix)]
+        return text[: -len(suffix)]
     return text
 
 
@@ -46,7 +46,9 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
         ocr = removesuffix(gt, gt_suffix) + ocr_suffix
 
         gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
-        ocr_text = plain_extract(os.path.join(ocr_dir, ocr), include_filename_in_id=True)
+        ocr_text = plain_extract(
+            os.path.join(ocr_dir, ocr), include_filename_in_id=True
+        )
         gt_words = words_normalized(gt_text)
         ocr_words = words_normalized(ocr_text)
 
@@ -56,7 +58,9 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
             cer, n_characters = l_cer, l_n_characters
         else:
             # Rolling update
-            cer = (cer * n_characters + l_cer * l_n_characters) / (n_characters + l_n_characters)
+            cer = (cer * n_characters + l_cer * l_n_characters) / (
+                n_characters + l_n_characters
+            )
             n_characters = n_characters + l_n_characters
 
         # Compute WER
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index ad8eaf2..2120b80 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -17,6 +17,7 @@ def distance(seq1: list[str], seq2: list[str]):
     """
     return Levenshtein.distance(seq1, seq2)
 
+
 @multimethod
 def distance(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings
diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py
index 0ddebf5..19ad9c1 100644
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -175,7 +175,7 @@ class ExtractedText:
             return self._grapheme_clusters
         else:
             clusters = []
-            for seg in  self.segments:
+            for seg in self.segments:
                 # todo could there be cases where joiner is no grapheme cluster?
                 clusters.extend(seg.grapheme_clusters + [self.joiner])
             return clusters[:-1]
@@ -242,7 +242,9 @@ class ExtractedText:
     def from_str(cls, text, normalization=Normalization.NFC_SBB):
         normalized_text = normalize(text, normalization)
         clusters = list(grapheme_clusters(normalized_text))
-        return cls(None, None, None, normalized_text, clusters, normalization=normalization)
+        return cls(
+            None, None, None, normalized_text, clusters, normalization=normalization
+        )
 
 
 def invert_dict(d):
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 38190da..6384dfa 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -98,14 +98,18 @@ def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
 
         ro_children = filter(lambda child: "index" in child.attrib.keys(), ro_children)
         ro_children = sorted(ro_children, key=lambda child: int(child.attrib["index"]))
-    elif ET.QName(group.tag).localname in ["UnorderedGroup","UnorderedGroupIndexed"]:
+    elif ET.QName(group.tag).localname in ["UnorderedGroup", "UnorderedGroupIndexed"]:
         ro_children = list(group)
     else:
         raise NotImplementedError
 
-
     for ro_child in ro_children:
-        if ET.QName(ro_child.tag).localname in ["OrderedGroup", "OrderedGroupIndexed", "UnorderedGroup", "UnorderedGroupIndexed"]:
+        if ET.QName(ro_child.tag).localname in [
+            "OrderedGroup",
+            "OrderedGroupIndexed",
+            "UnorderedGroup",
+            "UnorderedGroupIndexed",
+        ]:
             regions.extend(
                 extract_texts_from_reading_order_group(
                     ro_child, tree, nsmap, textequiv_level
@@ -139,7 +143,11 @@ def plain_extract(filename, include_filename_in_id=False):
         clusters = list(grapheme_clusters(normalized_text))
         return ExtractedText(
             id_template.format(filename=os.path.basename(filename), no=no),
-            None, None, normalized_text, clusters)
+            None,
+            None,
+            normalized_text,
+            clusters,
+        )
 
     with open(filename, "r") as f:
         return ExtractedText(
@@ -147,7 +155,7 @@ def plain_extract(filename, include_filename_in_id=False):
             [make_segment(no, line) for no, line in enumerate(f.readlines())],
             "\n",
             None,
-            None
+            None,
         )
     # XXX hardcoded SBB normalization
 
diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py
index 7c513e6..9578a0a 100644
--- a/qurator/dinglehopper/ocrd_cli.py
+++ b/qurator/dinglehopper/ocrd_cli.py
@@ -33,7 +33,7 @@ class OcrdDinglehopperEvaluate(Processor):
         textequiv_level = self.parameter["textequiv_level"]
         gt_grp, ocr_grp = self.input_file_grp.split(",")
 
-        input_file_tuples = self.zip_input_files(on_error='abort')
+        input_file_tuples = self.zip_input_files(on_error="abort")
         for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
             if not gt_file or not ocr_file:
                 # file/page was not found in this group
diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py
index 0976921..3b9ff5e 100644
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -40,7 +40,6 @@ def words(s: str):
     if not word_break_patched:
         patch_word_break()
 
-
     # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
     def unwanted(c):
 

From a1f0a5e2d36969e6ffcf67256e9194b058513efe Mon Sep 17 00:00:00 2001
From: Max Bachmann <kontakt@maxbachmann.de>
Date: Mon, 29 Aug 2022 22:08:25 +0200
Subject: [PATCH 006/176] replace uniseg with uniseg2

---
 qurator/dinglehopper/character_error_rate.py |  2 +-
 qurator/dinglehopper/cli.py                  | 18 ++++++++++++++++++
 qurator/dinglehopper/edit_distance.py        |  2 +-
 qurator/dinglehopper/extracted_text.py       |  2 +-
 qurator/dinglehopper/ocr_files.py            |  2 +-
 qurator/dinglehopper/word_error_rate.py      | 14 +++++++-------
 requirements.txt                             |  2 +-
 7 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py
index 3b8c0cc..68accae 100644
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -2,7 +2,7 @@ import unicodedata
 from typing import Tuple
 
 from multimethod import multimethod
-from uniseg.graphemecluster import grapheme_clusters
+from uniseg2.graphemecluster import grapheme_clusters
 
 from .edit_distance import distance
 from .extracted_text import ExtractedText
diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 3c52c5d..7b74b78 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -175,6 +175,24 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
     By default, the text of PAGE files is extracted on 'region' level. You may
     use "--textequiv-level line" to extract from the level of TextLine tags.
     """
+    import cProfile
+    import pstats
+    import io
+    import atexit
+
+    #print("Profiling...")
+    #pr = cProfile.Profile()
+    #pr.enable()
+
+    def exit():
+        pr.disable()
+        print("Profiling completed")
+        s = io.StringIO()
+        pstats.Stats(pr, stream=s).sort_stats("cumtime").print_stats()
+        print(s.getvalue())
+
+    #atexit.register(exit)
+
     initLogging()
     Config.progress = progress
     process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index 2120b80..d89eb8c 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -1,7 +1,7 @@
 import unicodedata
 
 from multimethod import multimethod
-from uniseg.graphemecluster import grapheme_clusters
+from uniseg2.graphemecluster import grapheme_clusters
 from rapidfuzz.distance import Levenshtein
 
 from .extracted_text import ExtractedText
diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py
index 19ad9c1..ebb9631 100644
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -9,7 +9,7 @@ import attr
 import numpy as np
 from lxml import etree as ET
 from ocrd_utils import getLogger
-from uniseg.graphemecluster import grapheme_clusters
+from uniseg2.graphemecluster import grapheme_clusters
 
 
 class Normalization(enum.Enum):
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 6384dfa..29101cd 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -4,7 +4,7 @@ from typing import Iterator
 
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
-from uniseg.graphemecluster import grapheme_clusters
+from uniseg2.graphemecluster import grapheme_clusters
 
 from .extracted_text import ExtractedText, normalize_sbb
 
diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py
index 3b9ff5e..ccfc64a 100644
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -2,24 +2,24 @@ import unicodedata
 from typing import Tuple, Iterable
 from multimethod import multimethod
 
-import uniseg.wordbreak
+import uniseg2.wordbreak
 
 from rapidfuzz.distance import Levenshtein
 from . import ExtractedText
 
 
-# Did we patch uniseg.wordbreak.word_break already?
+# Did we patch uniseg2.wordbreak.word_break already?
 word_break_patched = False
 
 
 def patch_word_break():
     """
-    Patch uniseg.wordbreak.word_break to deal with our private use characters.
+    Patch uniseg2.wordbreak.word_break to deal with our private use characters.
 
     See also
     https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
     """
-    old_word_break = uniseg.wordbreak.word_break
+    old_word_break = uniseg2.wordbreak.word_break
 
     def new_word_break(c, index=0):
         if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
@@ -27,7 +27,7 @@ def patch_word_break():
         else:
             return old_word_break(c, index)
 
-    uniseg.wordbreak.word_break = new_word_break
+    uniseg2.wordbreak.word_break = new_word_break
     global word_break_patched
     word_break_patched = True
 
@@ -53,8 +53,8 @@ def words(s: str):
         return cat in unwanted_categories or subcat in unwanted_subcategories
 
     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
-    # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
-    for word in uniseg.wordbreak.words(s):
+    # uniseg2.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
+    for word in uniseg2.wordbreak.words(s):
         if all(unwanted(c) for c in word):
             pass
         else:
diff --git a/requirements.txt b/requirements.txt
index daf2b0f..3c7a257 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 click
 jinja2
 lxml
-uniseg
+uniseg2
 numpy
 colorama
 MarkupSafe

From d2bbc8a6c7d150d5bdd95cc139112eacf963dc78 Mon Sep 17 00:00:00 2001
From: Max Bachmann <kontakt@maxbachmann.de>
Date: Sun, 11 Sep 2022 02:38:32 +0200
Subject: [PATCH 007/176] update rapidfuzz version

---
 qurator/dinglehopper/align.py         |  5 ++---
 qurator/dinglehopper/cli.py           | 27 +++++----------------------
 qurator/dinglehopper/cli_line_dirs.py |  5 +++--
 requirements.txt                      |  2 +-
 4 files changed, 11 insertions(+), 28 deletions(-)

diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py
index cc96891..968d931 100644
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -1,7 +1,6 @@
 from .edit_distance import *
 from rapidfuzz.distance import Levenshtein
 
-
 def align(t1, t2):
     """Align text."""
     s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
@@ -9,11 +8,11 @@ def align(t1, t2):
     return seq_align(s1, s2)
 
 
-def seq_align(s1, s2):
+def seq_align(s1, s2, score_hint=None):
     """Align general sequences."""
     s1 = list(s1)
     s2 = list(s2)
-    ops = Levenshtein.editops(s1, s2)
+    ops = Levenshtein.editops(s1, s2, score_hint=score_hint)
     i = 0
     j = 0
 
diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 7b74b78..ef101a4 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -4,6 +4,7 @@ import click
 from jinja2 import Environment, FileSystemLoader
 from markupsafe import escape
 from ocrd_utils import initLogging
+from math import ceil
 
 from .character_error_rate import character_error_rate_n
 from .word_error_rate import word_error_rate_n, words_normalized
@@ -13,7 +14,7 @@ from .ocr_files import extract
 from .config import Config
 
 
-def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
+def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
     gtx = ""
     ocrx = ""
 
@@ -52,7 +53,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
 
     g_pos = 0
     o_pos = 0
-    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
+    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, score_hint)):
         css_classes = None
         gt_id = None
         ocr_id = None
@@ -109,12 +110,12 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
 
     cer, n_characters = character_error_rate_n(gt_text, ocr_text)
     char_diff_report = gen_diff_report(
-        gt_text, ocr_text, css_prefix="c", joiner="", none="·"
+        gt_text, ocr_text, css_prefix="c", joiner="", none="·", score_hint=int(ceil(cer * n_characters))
     )
 
     wer, n_words = word_error_rate_n(gt_words, ocr_words)
     word_diff_report = gen_diff_report(
-        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
+        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯", score_hint=int(ceil(wer * n_words))
     )
 
     env = Environment(
@@ -175,24 +176,6 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
     By default, the text of PAGE files is extracted on 'region' level. You may
     use "--textequiv-level line" to extract from the level of TextLine tags.
     """
-    import cProfile
-    import pstats
-    import io
-    import atexit
-
-    #print("Profiling...")
-    #pr = cProfile.Profile()
-    #pr.enable()
-
-    def exit():
-        pr.disable()
-        print("Profiling completed")
-        s = io.StringIO()
-        pstats.Stats(pr, stream=s).sort_stats("cumtime").print_stats()
-        print(s.getvalue())
-
-    #atexit.register(exit)
-
     initLogging()
     Config.progress = progress
     process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py
index 3f8e3fc..06bbe39 100644
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -4,6 +4,7 @@ import itertools
 import click
 from jinja2 import Environment, FileSystemLoader
 from ocrd_utils import initLogging
+from math import ceil
 
 from .character_error_rate import character_error_rate_n
 from .word_error_rate import word_error_rate_n, words_normalized
@@ -74,10 +75,10 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
 
         # Generate diff reports
         char_diff_report += gen_diff_report(
-            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
+            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·", score_hint=int(ceil(l_cer * l_n_characters))
         )
         word_diff_report += gen_diff_report(
-            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯"
+            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯", score_hint=int(ceil(l_wer * l_n_words))
         )
 
     env = Environment(
diff --git a/requirements.txt b/requirements.txt
index 3c7a257..0389f61 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,5 +9,5 @@ ocrd >= 2.20.1
 attrs
 multimethod == 1.3  # latest version to officially support Python 3.5
 tqdm
-rapidfuzz >= 2.4.2
+rapidfuzz >= 2.7.0
 six  # XXX workaround OCR-D/core#730

From f48e305347ff9ae9fec3641d8bd4101562dda3a3 Mon Sep 17 00:00:00 2001
From: Max Bachmann <kontakt@maxbachmann.de>
Date: Wed, 12 Oct 2022 18:52:58 +0200
Subject: [PATCH 008/176] use uniseg again

---
 qurator/dinglehopper/character_error_rate.py |  2 +-
 qurator/dinglehopper/edit_distance.py        |  2 +-
 qurator/dinglehopper/extracted_text.py       |  2 +-
 qurator/dinglehopper/ocr_files.py            |  2 +-
 qurator/dinglehopper/word_error_rate.py      | 14 +++++++-------
 requirements.txt                             |  2 +-
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py
index 68accae..3b8c0cc 100644
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -2,7 +2,7 @@ import unicodedata
 from typing import Tuple
 
 from multimethod import multimethod
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 
 from .edit_distance import distance
 from .extracted_text import ExtractedText
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index d89eb8c..2120b80 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -1,7 +1,7 @@
 import unicodedata
 
 from multimethod import multimethod
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 from rapidfuzz.distance import Levenshtein
 
 from .extracted_text import ExtractedText
diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py
index ebb9631..19ad9c1 100644
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -9,7 +9,7 @@ import attr
 import numpy as np
 from lxml import etree as ET
 from ocrd_utils import getLogger
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 
 
 class Normalization(enum.Enum):
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 29101cd..6384dfa 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -4,7 +4,7 @@ from typing import Iterator
 
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 
 from .extracted_text import ExtractedText, normalize_sbb
 
diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py
index ccfc64a..3b9ff5e 100644
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -2,24 +2,24 @@ import unicodedata
 from typing import Tuple, Iterable
 from multimethod import multimethod
 
-import uniseg2.wordbreak
+import uniseg.wordbreak
 
 from rapidfuzz.distance import Levenshtein
 from . import ExtractedText
 
 
-# Did we patch uniseg2.wordbreak.word_break already?
+# Did we patch uniseg.wordbreak.word_break already?
 word_break_patched = False
 
 
 def patch_word_break():
     """
-    Patch uniseg2.wordbreak.word_break to deal with our private use characters.
+    Patch uniseg.wordbreak.word_break to deal with our private use characters.
 
     See also
     https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
     """
-    old_word_break = uniseg2.wordbreak.word_break
+    old_word_break = uniseg.wordbreak.word_break
 
     def new_word_break(c, index=0):
         if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
@@ -27,7 +27,7 @@ def patch_word_break():
         else:
             return old_word_break(c, index)
 
-    uniseg2.wordbreak.word_break = new_word_break
+    uniseg.wordbreak.word_break = new_word_break
     global word_break_patched
     word_break_patched = True
 
@@ -53,8 +53,8 @@ def words(s: str):
         return cat in unwanted_categories or subcat in unwanted_subcategories
 
     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
-    # uniseg2.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
-    for word in uniseg2.wordbreak.words(s):
+    # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
+    for word in uniseg.wordbreak.words(s):
         if all(unwanted(c) for c in word):
             pass
         else:
diff --git a/requirements.txt b/requirements.txt
index 0389f61..11d1dcf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 click
 jinja2
 lxml
-uniseg2
+uniseg
 numpy
 colorama
 MarkupSafe

From a18b25b1633d71b1019e3952eee26c78ba6c2d12 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 27 Jan 2023 19:13:45 +0100
Subject: [PATCH 009/176] =?UTF-8?q?=F0=9F=90=9B=20Update=20tests=20for=20E?=
 =?UTF-8?q?xtractedText?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In PR gh-72, @maxbachmann introduced a new argument for ExtractedText(). Update the
corresponding tests.
---
 .../dinglehopper/tests/extracted_text_test.py | 31 ++++++++++++-------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py
index 8a81587..bc230d6 100644
--- a/qurator/dinglehopper/tests/extracted_text_test.py
+++ b/qurator/dinglehopper/tests/extracted_text_test.py
@@ -13,12 +13,13 @@ def test_text():
     test1 = ExtractedText(
         None,
         [
-            ExtractedText("s0", None, None, "foo"),
-            ExtractedText("s1", None, None, "bar"),
-            ExtractedText("s2", None, None, "bazinga"),
+            ExtractedText("s0", None, None, "foo", grapheme_clusters("foo")),
+            ExtractedText("s1", None, None, "bar", grapheme_clusters("bar")),
+            ExtractedText("s2", None, None, "bazinga", grapheme_clusters("bazinga")),
         ],
         " ",
         None,
+        None,
     )
 
     assert test1.text == "foo bar bazinga"
@@ -29,8 +30,12 @@ def test_text():
 
 def test_normalization_check():
     with pytest.raises(ValueError, match=r".*is not in NFC.*"):
-        ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ"))
-    assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ"))
+        ExtractedText("foo", None, None,
+                      unicodedata.normalize("NFD", "Schlyñ"),
+                      grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")))
+    assert ExtractedText("foo", None, None,
+                         unicodedata.normalize("NFC", "Schlyñ"),
+                         grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")))
 
 
 AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
@@ -47,25 +52,27 @@ def test_align():
     test1 = ExtractedText(
         None,
         [
-            ExtractedText("s0", None, None, "foo"),
-            ExtractedText("s1", None, None, "bar"),
-            ExtractedText("s2", None, None, "batzinga"),
+            ExtractedText("s0", None, None, "foo", grapheme_clusters("foo")),
+            ExtractedText("s1", None, None, "bar", grapheme_clusters("bar")),
+            ExtractedText("s2", None, None, "batzinga", grapheme_clusters("batzinga")),
         ],
         " ",
         None,
+        None,
     )
     test2 = ExtractedText(
         None,
         [
-            ExtractedText("x0", None, None, "foo"),
-            ExtractedText("x1", None, None, "bar"),
+            ExtractedText("x0", None, None, "foo", grapheme_clusters("foo")),
+            ExtractedText("x1", None, None, "bar", grapheme_clusters("bar")),
             # extra .
-            ExtractedText("x2", None, None, "."),
+            ExtractedText("x2", None, None, ".", grapheme_clusters(".")),
             # deletion + different grapheme cluster, m̃ also is two Python characters
-            ExtractedText("x3", None, None, "bazim̃ga"),
+            ExtractedText("x3", None, None, "bazim̃ga", grapheme_clusters("bazim̃ga")),
         ],
         " ",
         None,
+        None,
     )
 
     left_pos = 0

From 563642c93b21e796755d9415fb1bc0fc3836c0ad Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 4 Aug 2023 20:30:50 +0200
Subject: [PATCH 010/176] =?UTF-8?q?=F0=9F=90=9B=20Workaround=20sdist=20not?=
 =?UTF-8?q?=20containing=20top-level=20ocrd-tool.json?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See https://github.com/qurator-spk/setuptools_ocrd/issues/10 - The sdist does not
contain ocrd-tool.json, so that the wheel built from it does not get the proper version.
Needs to be fixed in setuptools_ocrd, then MANIFEST.in can be removed again.
---
 MANIFEST.in | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..5f4b37e
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+# FIXME: Workaround https://github.com/qurator-spk/setuptools_ocrd/issues/10
+include ocrd-tool.json

From 668072e338d08e0d7e8e56e7ea213e1bd2bf8696 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 4 Aug 2023 20:34:35 +0200
Subject: [PATCH 011/176] =?UTF-8?q?=F0=9F=A7=B9=20.gitignore=20dist/?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 2291cd6..d931831 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ dmypy.json
 
 # Build artifacts
 /build
+/dist

From 98a67c7b3b7328201fafa82bc2d36e361490029c Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 4 Aug 2023 20:35:42 +0200
Subject: [PATCH 012/176] =?UTF-8?q?=F0=9F=93=A6=20v0.9.2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocrd-tool.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json
index c4f8c4e..33a709f 100644
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@@ -1,5 +1,5 @@
 {
-  "version": "0.9.1",
+  "version": "0.9.2",
   "git_url": "https://github.com/qurator-spk/dinglehopper",
   "tools": {
     "ocrd-dinglehopper": {

From 12b1ea3ae7a92f6b2f13518822113880b9b5a04a Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 11 Aug 2023 19:52:12 +0200
Subject: [PATCH 013/176] =?UTF-8?q?=F0=9F=90=9B=20Remove=20MANIFEST.in=20w?=
 =?UTF-8?q?orkaround,=20now=20that=20setuptools=5Focrd=20is=20fixed?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 MANIFEST.in | 2 --
 1 file changed, 2 deletions(-)
 delete mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 5f4b37e..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,2 +0,0 @@
-# FIXME: Workaround https://github.com/qurator-spk/setuptools_ocrd/issues/10
-include ocrd-tool.json

From 6c70afbbc577e9bb31641dad6e281af9562964af Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 11 Aug 2023 19:53:04 +0200
Subject: [PATCH 014/176] =?UTF-8?q?=F0=9F=93=A6=20v0.9.3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocrd-tool.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json
index 33a709f..2b08ace 100644
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@@ -1,5 +1,5 @@
 {
-  "version": "0.9.2",
+  "version": "0.9.3",
   "git_url": "https://github.com/qurator-spk/dinglehopper",
   "tools": {
     "ocrd-dinglehopper": {

From de70b198acb1a74b4d09306866362838b7aa1922 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 11 Aug 2023 20:04:02 +0200
Subject: [PATCH 015/176] =?UTF-8?q?=F0=9F=A7=B9=20Remove=20empty=20setup.c?=
 =?UTF-8?q?fg?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 setup.cfg | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 setup.cfg

diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index e69de29..0000000

From 9594b4c9d2596d66fac5fe6a56b84c3763b58a36 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 11 Aug 2023 20:04:35 +0200
Subject: [PATCH 016/176] =?UTF-8?q?=F0=9F=A7=B9=20pyproject:=20Remove=20ex?=
 =?UTF-8?q?tra=20*.json?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2e98ae1..da33b15 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,7 +48,7 @@ optional-dependencies.dev = {file = ["requirements-dev.txt"]}
 where = ["src"]
 
 [tool.setuptools.package-data]
-dinglehopper = ["*.json", "templates/*"]
+dinglehopper = ["templates/*"]
 
 
 [tool.pytest.ini_options]

From 1e7c46285be455f3f87da4c47b02b8d6a2309c83 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 11 Aug 2023 20:06:49 +0200
Subject: [PATCH 017/176] =?UTF-8?q?=F0=9F=8E=A8=20editorconfig:=20*.json?=
 =?UTF-8?q?=20should=20have=20a=20final=20newline?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .editorconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.editorconfig b/.editorconfig
index ea42d71..6959d70 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -15,7 +15,7 @@ indent_size = 2
 
 [*.json]
 indent_size = 2
-insert_final_newline = false
+insert_final_newline = true
 
 # trailing spaces in markdown indicate word wrap
 [*.md]

From a1a7f95ac6156c42403708d431cc66cb34118a5f Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 11 Aug 2023 20:07:06 +0200
Subject: [PATCH 018/176] =?UTF-8?q?=F0=9F=93=A6=20v0.9.4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocrd-tool.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json
index 2b08ace..a71ce37 100644
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@@ -1,5 +1,5 @@
 {
-  "version": "0.9.3",
+  "version": "0.9.4",
   "git_url": "https://github.com/qurator-spk/dinglehopper",
   "tools": {
     "ocrd-dinglehopper": {

From 54a31211725663bba5af7e04f93db69764211b7d Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 11 Aug 2023 20:28:01 +0200
Subject: [PATCH 019/176] =?UTF-8?q?=E2=9C=92=20README:=20Recommend=20insta?=
 =?UTF-8?q?lling=20via=20pip=20and=20from=20PyPI?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index affcfe8..3a0b56e 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ dinglehopper is an OCR evaluation tool and reads
 [ALTO](https://github.com/altoxml),
 [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files.  It
 compares a ground truth (GT) document page with a OCR result page to compute
-metrics and a word/character differences report. It also supports batch processing by 
+metrics and a word/character differences report. It also supports batch processing by
 generating, aggregating and summarizing multiple reports.
 
 [![Tests](https://github.com/qurator-spk/dinglehopper/workflows/test/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
@@ -23,10 +23,11 @@ Goals
 
 Installation
 ------------
-It's best to use pip, e.g.:
-~~~
-sudo pip install .
-~~~
+
+It's best to use pip to install the package from PyPI, e.g.:
+```
+pip install dinglehopper
+```
 
 Usage
 -----

From dbaccdd5e327f7c86e586e61a3f7b2612cbe99e5 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 11 Aug 2023 20:28:29 +0200
Subject: [PATCH 020/176] =?UTF-8?q?=E2=9C=92=20README:=20Minor=20whitespac?=
 =?UTF-8?q?e=20cleanup?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 3a0b56e..00fd899 100644
--- a/README.md
+++ b/README.md
@@ -70,19 +70,19 @@ This generates `report.html` and `report.json`.
 
 ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true)
 
-Batch comparison between folders of GT and OCR files can be done by simply providing 
+Batch comparison between folders of GT and OCR files can be done by simply providing
 folders:
 ~~~
 dinglehopper gt/ ocr/ report output_folder/
 ~~~
-This assumes that you have files with the same name in both folders, e.g. 
+This assumes that you have files with the same name in both folders, e.g.
 `gt/00000001.page.xml` and `ocr/00000001.alto.xml`.
 
-The example generates reports for each set of files, with the prefix `report`, in the 
+The example generates reports for each set of files, with the prefix `report`, in the
 (automatically created) folder `output_folder/`.
 
-By default, the JSON report does not contain the character and word differences, only 
-the calculated metrics. If you want to include the differences, use the 
+By default, the JSON report does not contain the character and word differences, only
+the calculated metrics. If you want to include the differences, use the
 `--differences` flag:
 
 ~~~
@@ -90,7 +90,7 @@ dinglehopper gt/ ocr/ report output_folder/ --differences
 ~~~
 
 ### dinglehopper-summarize
-A set of (JSON) reports can be summarized into a single set of 
+A set of (JSON) reports can be summarized into a single set of
 reports. This is useful after having generated reports in batch.
 Example:
 ~~~
@@ -100,7 +100,7 @@ This generates `summary.html` and `summary.json` in the same `output_folder`.
 
 If you are summarizing many reports and have used the `--differences` flag while
 generating them, it may be useful to limit the number of differences reported by using
-the `--occurences-threshold` parameter. This will reduce the size of the generated HTML 
+the `--occurences-threshold` parameter. This will reduce the size of the generated HTML
 report, making it easier to open and navigate. Note that the JSON report will still
 contain all differences. Example:
 ~~~

From 9d862e418b1f20c560cf084af3df4764c328b41e Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 Oct 2023 14:03:07 +0200
Subject: [PATCH 021/176] =?UTF-8?q?=E2=9C=94=20Add=20mets:FLocat's=20@LOCT?=
 =?UTF-8?q?YPE/OTHERLOCTYPE=20to=20test=20data?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Newest OCR-D wasn't happy with the test data anymore (see gh-89). I'm not sure if the
test data was invalid the way it was, but having a LOCTYPE certainly is "prettier" so
adding it. This fixes the test again.
---
 src/dinglehopper/tests/data/actevedef_718448162/mets.xml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/dinglehopper/tests/data/actevedef_718448162/mets.xml b/src/dinglehopper/tests/data/actevedef_718448162/mets.xml
index a6804ca..ed7c4f4 100644
--- a/src/dinglehopper/tests/data/actevedef_718448162/mets.xml
+++ b/src/dinglehopper/tests/data/actevedef_718448162/mets.xml
@@ -138,17 +138,17 @@
   <mets:fileSec>
     <mets:fileGrp USE="OCR-D-GT-PAGE">
       <mets:file MIMETYPE="application/xml" ID="OCR-D-GT-PAGE_00000024">
-        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-GT-PAGE/00000024.page.xml"/>
+        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-GT-PAGE/00000024.page.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
       </mets:file>
     </mets:fileGrp>
     <mets:fileGrp USE="OCR-D-OCR-CALAMARI">
       <mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-CALAMARI_0001">
-        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml"/>
+        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
       </mets:file>
     </mets:fileGrp>
     <mets:fileGrp USE="OCR-D-OCR-TESS">
       <mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-TESS_0001">
-        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml"/>
+        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
       </mets:file>
     </mets:fileGrp>
   </mets:fileSec>

From 5450f193e42ba9f9dc95871ad6c1eb4b436c345a Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 Oct 2023 14:08:14 +0200
Subject: [PATCH 022/176] =?UTF-8?q?=E2=9C=94=20GitHub=20Actions:=20Test=20?=
 =?UTF-8?q?on=20Python=203.12?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7d55459..ab7233e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,7 +21,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12" ]
 
     # For Python 3.6, we need to fall back to Ubuntu 20.04
     runs-on: ${{ matrix.python-version == '3.6' && 'ubuntu-20.04' || 'ubuntu-latest' }}

From 1b7c2a61a38a9371f78cf2f21bd6b4dd2866c827 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 Oct 2023 14:34:33 +0200
Subject: [PATCH 023/176] =?UTF-8?q?=E2=9C=94=20Remove=20CircleCI=20config?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .circleci/config.yml | 20 --------------------
 1 file changed, 20 deletions(-)
 delete mode 100644 .circleci/config.yml

diff --git a/.circleci/config.yml b/.circleci/config.yml
deleted file mode 100644
index 7aecdd0..0000000
--- a/.circleci/config.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-version: 2.1
-
-jobs:
-  black:
-    parameters:
-      python-version:
-        type: string
-    docker:
-      - image: cimg/python:<< parameters.python-version >>
-    steps:
-      - checkout
-      - run: pip3 install --upgrade pip
-      - run: pip3 install black
-      - run: black .
-
-workflows:
-  black:
-    jobs:
-      - black:
-          python-version: "3.11"

From 061ba16461aa42d6ae10b7855169f8eb01ca9dba Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 Oct 2023 15:07:22 +0200
Subject: [PATCH 024/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hook?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index dd7b710..8a6391d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v3.2.0
+    rev: v4.5.0
     hooks:
     -   id: trailing-whitespace
     -   id: end-of-file-fixer
@@ -13,17 +13,19 @@ repos:
     -   id: check-ast
 
 -   repo: https://github.com/psf/black
-    rev: 22.10.0
+    rev: 23.10.0
     hooks:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.0.280
+    rev: v0.1.1
     hooks:
     -   id: ruff
         args: [--fix, --exit-non-zero-on-fix]
 
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.4.1
+    rev: v1.6.1
     hooks:
-    -   id: mypy
+    -   additional_dependencies:
+        - types-setuptools
+        id: mypy

From 4e0d4dcf09329ef71af319b2405d8e35461518c3 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 Oct 2023 15:08:16 +0200
Subject: [PATCH 025/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Add=20pre-com?=
 =?UTF-8?q?mit-update=20hook=20(to=20update=20hooks=20using=20pre-commit)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8a6391d..d0ae66d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,5 +1,3 @@
-# See https://pre-commit.com for more information
-# See https://pre-commit.com/hooks.html for more hooks
 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.5.0
@@ -20,8 +18,10 @@ repos:
 -   repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.1.1
     hooks:
-    -   id: ruff
-        args: [--fix, --exit-non-zero-on-fix]
+    -   args:
+        - --fix
+        - --exit-non-zero-on-fix
+        id: ruff
 
 -   repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.6.1
@@ -29,3 +29,8 @@ repos:
     -   additional_dependencies:
         - types-setuptools
         id: mypy
+
+-   repo: https://gitlab.com/vojko.pribudic/pre-commit-update
+    rev: v0.1.0
+    hooks:
+    -   id: pre-commit-update

From 8a1ea4ec93d22539c63ec1f73c8efd32e0be4bbf Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 Oct 2023 15:15:00 +0200
Subject: [PATCH 026/176] =?UTF-8?q?=F0=9F=8E=A8=20Add=20newlines=20at=20en?=
 =?UTF-8?q?d=20of=20files=20(ruff)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/tests/data/test.alto1.xml | 2 +-
 src/dinglehopper/tests/data/test.alto2.xml | 2 +-
 src/dinglehopper/tests/data/test.txt       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/dinglehopper/tests/data/test.alto1.xml b/src/dinglehopper/tests/data/test.alto1.xml
index ac2a50b..35aa19a 100644
--- a/src/dinglehopper/tests/data/test.alto1.xml
+++ b/src/dinglehopper/tests/data/test.alto1.xml
@@ -20183,4 +20183,4 @@
             </PrintSpace>
         </Page>
     </Layout>
-</alto>
\ No newline at end of file
+</alto>
diff --git a/src/dinglehopper/tests/data/test.alto2.xml b/src/dinglehopper/tests/data/test.alto2.xml
index 67d3537..39dd592 100644
--- a/src/dinglehopper/tests/data/test.alto2.xml
+++ b/src/dinglehopper/tests/data/test.alto2.xml
@@ -61,4 +61,4 @@
 </PrintSpace>
 </Page>
 </Layout>
-</alto>
\ No newline at end of file
+</alto>
diff --git a/src/dinglehopper/tests/data/test.txt b/src/dinglehopper/tests/data/test.txt
index 41bfe81..102374b 100644
--- a/src/dinglehopper/tests/data/test.txt
+++ b/src/dinglehopper/tests/data/test.txt
@@ -1 +1 @@
-Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
\ No newline at end of file
+Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.

From fe60361e8d9ef504df25fa229a215c6d19840a57 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 Oct 2023 15:17:06 +0200
Subject: [PATCH 027/176] =?UTF-8?q?=E2=9C=92=20README-DEV:=20Make=20pre-co?=
 =?UTF-8?q?mmit=20section=20top-level=20(+=20small=20whitespace=20fix)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README-DEV.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README-DEV.md b/README-DEV.md
index cdd51fd..a3441b1 100644
--- a/README-DEV.md
+++ b/README-DEV.md
@@ -10,6 +10,7 @@ pytest
 ```
 
 ## Test running examples
+
 Only unit tests:
 ```bash
 pytest -m "not integration"
@@ -36,7 +37,7 @@ pytest -k "not test" --mypy
 pytest -k "not test" --ruff
 ```
 
-## How to use pre-commit
+# How to use pre-commit
 
 This project optionally uses [pre-commit](https://pre-commit.com) to check commits. To use it:
 

From 1c3b28d873099c9e8ea4d7dda47b6392f5436f55 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 Oct 2023 15:26:10 +0200
Subject: [PATCH 028/176] =?UTF-8?q?=E2=AC=86=20Update=20multimethod=20depe?=
 =?UTF-8?q?ndency?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We had some issues while reviewing/rebasing #72. We don't support Python 3.5 anymore,
so we are lifting the hard pin on multimethod 1.3.
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 8ee3d1d..fdb3dde 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ colorama
 MarkupSafe
 ocrd >= 2.20.1
 attrs
-multimethod == 1.3  # latest version to officially support Python 3.5
+multimethod >= 1.3
 tqdm
 rapidfuzz >= 2.4.2
 six  # XXX workaround OCR-D/core#730

From e7e0703d9d8f37923b2022e0615b8bcbf199ac75 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 Oct 2023 15:45:20 +0200
Subject: [PATCH 029/176] =?UTF-8?q?=E2=9C=94=20GitHub=20Actions:=20Test=20?=
 =?UTF-8?q?on=20PR?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/test.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index ab7233e..d5b360b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -6,6 +6,10 @@ on:
     branches:
       - master
 
+  pull_requests:
+    branches:
+      - master
+
   schedule:
     - cron: "00 16 07 * *"  # = monthly
 

From d8f84ec9ac6b77a241748b1b776a73d13fcb3516 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 Oct 2023 15:53:14 +0200
Subject: [PATCH 030/176] =?UTF-8?q?=F0=9F=A7=B9=20Remove=20old=20six=20dep?=
 =?UTF-8?q?endency=20(workaround=20for=20OCR-D/core#730)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index fdb3dde..cdf0219 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,5 +10,4 @@ attrs
 multimethod >= 1.3
 tqdm
 rapidfuzz >= 2.4.2
-six  # XXX workaround OCR-D/core#730
 chardet

From 3f8c8e69aa1ade0b716afd455e26f870da942842 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 Oct 2023 16:07:44 +0200
Subject: [PATCH 031/176] =?UTF-8?q?=F0=9F=90=9B=20(Hopefully)=20Fix=20runn?=
 =?UTF-8?q?ing=20tests=20on=20PR?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index d5b360b..61dc014 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -6,7 +6,7 @@ on:
     branches:
       - master
 
-  pull_requests:
+  pull_request:
     branches:
       - master
 

From f077ce2e1b34c9d7cbf0b5bad6d05d2593d0b577 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 25 May 2023 18:36:46 +0200
Subject: [PATCH 032/176] =?UTF-8?q?=F0=9F=90=9B=20dinglehopper-summarize:?=
 =?UTF-8?q?=20Handle=20reports=20without=20difference=20stats?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_summarize.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/dinglehopper/cli_summarize.py b/src/dinglehopper/cli_summarize.py
index 0422759..651f367 100644
--- a/src/dinglehopper/cli_summarize.py
+++ b/src/dinglehopper/cli_summarize.py
@@ -34,10 +34,13 @@ def process(reports_folder, occurrences_threshold=1):
                 cer_sum += cer
                 wer_sum += wer
 
-                for key, value in report_data["differences"]["character_level"].items():
-                    diff_c[key] = diff_c.get(key, 0) + value
-                for key, value in report_data["differences"]["word_level"].items():
-                    diff_w[key] = diff_w.get(key, 0) + value
+                try:
+                    for key, value in report_data["differences"]["character_level"].items():
+                        diff_c[key] = diff_c.get(key, 0) + value
+                    for key, value in report_data["differences"]["word_level"].items():
+                        diff_w[key] = diff_w.get(key, 0) + value
+                except KeyError:
+                    pass
 
     if len(cer_list) == 0:
         click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")

From 7ed076d3c1693bab4219fc956e1d849343b65e37 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 Oct 2023 15:26:10 +0200
Subject: [PATCH 033/176] =?UTF-8?q?=E2=AC=86=20Update=20multimethod=20depe?=
 =?UTF-8?q?ndency?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We had some issues while reviewing/rebasing #72. We don't support Python 3.5 anymore,
so we are lifting the hard pin on multimethod 1.3.
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 11d1dcf..7a2c39d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ colorama
 MarkupSafe
 ocrd >= 2.20.1
 attrs
-multimethod == 1.3  # latest version to officially support Python 3.5
+multimethod >= 1.3
 tqdm
 rapidfuzz >= 2.7.0
 six  # XXX workaround OCR-D/core#730

From 7fef02bf0aa5d7a5abd86bc58f141a9856a795af Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 23 Oct 2023 14:03:07 +0200
Subject: [PATCH 034/176] =?UTF-8?q?=E2=9C=94=20Add=20mets:FLocat's=20@LOCT?=
 =?UTF-8?q?YPE/OTHERLOCTYPE=20to=20test=20data?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Newest OCR-D wasn't happy with the test data anymore (see gh-89). I'm not sure if the
test data was invalid the way it was, but having a LOCTYPE certainly is "prettier" so
adding it. This fixes the test again.
---
 .../dinglehopper/tests/data/actevedef_718448162/mets.xml    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/qurator/dinglehopper/tests/data/actevedef_718448162/mets.xml b/qurator/dinglehopper/tests/data/actevedef_718448162/mets.xml
index a6804ca..ed7c4f4 100644
--- a/qurator/dinglehopper/tests/data/actevedef_718448162/mets.xml
+++ b/qurator/dinglehopper/tests/data/actevedef_718448162/mets.xml
@@ -138,17 +138,17 @@
   <mets:fileSec>
     <mets:fileGrp USE="OCR-D-GT-PAGE">
       <mets:file MIMETYPE="application/xml" ID="OCR-D-GT-PAGE_00000024">
-        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-GT-PAGE/00000024.page.xml"/>
+        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-GT-PAGE/00000024.page.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
       </mets:file>
     </mets:fileGrp>
     <mets:fileGrp USE="OCR-D-OCR-CALAMARI">
       <mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-CALAMARI_0001">
-        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml"/>
+        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
       </mets:file>
     </mets:fileGrp>
     <mets:fileGrp USE="OCR-D-OCR-TESS">
       <mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-TESS_0001">
-        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml"/>
+        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
       </mets:file>
     </mets:fileGrp>
   </mets:fileSec>

From bc95c0312737ce37d06b90d27af3078902524030 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 14 Mar 2023 13:16:09 +0100
Subject: [PATCH 035/176] =?UTF-8?q?=F0=9F=95=B8Do=20not=20use=20deprecated?=
 =?UTF-8?q?=20ID,=20pageId=20options?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See gh-75.
---
 qurator/dinglehopper/ocrd_cli.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py
index 9578a0a..c5f79cd 100644
--- a/qurator/dinglehopper/ocrd_cli.py
+++ b/qurator/dinglehopper/ocrd_cli.py
@@ -66,9 +66,9 @@ class OcrdDinglehopperEvaluate(Processor):
                 [".json", "application/json"],
             ]:
                 self.workspace.add_file(
-                    ID=file_id + report_suffix,
+                    file_id=file_id + report_suffix,
                     file_grp=self.output_file_grp,
-                    pageId=page_id,
+                    page_id=page_id,
                     mimetype=mimetype,
                     local_filename=report_prefix + report_suffix,
                 )

From e256526ea1d33e3673eb8bff466d8599277928ad Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 27 Oct 2023 20:55:37 +0200
Subject: [PATCH 036/176] =?UTF-8?q?=F0=9F=90=9B=20Fix=20calculation=20of?=
 =?UTF-8?q?=20score=5Fhint=20for=20edge=20cases,=20e.g.=20when=20CER=20is?=
 =?UTF-8?q?=20infinite?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If the CER is infinite, we can't calculate a score_hint as an int. Fall back to None
in this case.
---
 qurator/dinglehopper/align.py            | 19 +++++++++++++++++++
 qurator/dinglehopper/cli.py              |  8 +++++---
 qurator/dinglehopper/cli_line_dirs.py    |  6 ++++--
 qurator/dinglehopper/tests/test_align.py |  7 ++++++-
 4 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py
index 968d931..07cbc8f 100644
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -1,3 +1,6 @@
+import math
+from math import ceil
+
 from .edit_distance import *
 from rapidfuzz.distance import Levenshtein
 
@@ -8,6 +11,22 @@ def align(t1, t2):
     return seq_align(s1, s2)
 
 
+def score_hint(er: float, n: int) -> int | None:
+    """Calculate RapidFuzz score hint for a given error rate and count.
+
+    Gives the score hint for the distance functions (= expected distance) or None if
+    the error rate is inf.
+    """
+    assert not math.isnan(er)
+    try:
+        score_hint = int(ceil(er * n))
+    except (OverflowError, ValueError):
+        # ceil(er * n) can be inf or NaN (for n == 0), so int() can throw an
+        # OverflowError and a ValueError.
+        score_hint = None
+    return score_hint
+
+
 def seq_align(s1, s2, score_hint=None):
     """Align general sequences."""
     s1 = list(s1)
diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index ef101a4..4d4349c 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -8,7 +8,7 @@ from math import ceil
 
 from .character_error_rate import character_error_rate_n
 from .word_error_rate import word_error_rate_n, words_normalized
-from .align import seq_align
+from .align import seq_align, score_hint
 from .extracted_text import ExtractedText
 from .ocr_files import extract
 from .config import Config
@@ -110,12 +110,14 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
 
     cer, n_characters = character_error_rate_n(gt_text, ocr_text)
     char_diff_report = gen_diff_report(
-        gt_text, ocr_text, css_prefix="c", joiner="", none="·", score_hint=int(ceil(cer * n_characters))
+        gt_text, ocr_text, css_prefix="c", joiner="", none="·",
+        score_hint=score_hint(cer, n_characters)
     )
 
     wer, n_words = word_error_rate_n(gt_words, ocr_words)
     word_diff_report = gen_diff_report(
-        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯", score_hint=int(ceil(wer * n_words))
+        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯",
+        score_hint=score_hint(wer, n_words)
     )
 
     env = Environment(
diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py
index 06bbe39..01ba959 100644
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -75,10 +75,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
 
         # Generate diff reports
         char_diff_report += gen_diff_report(
-            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·", score_hint=int(ceil(l_cer * l_n_characters))
+            gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·",
+            score_hint=score_hint(l_cer, l_n_characters)
         )
         word_diff_report += gen_diff_report(
-            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯", score_hint=int(ceil(l_wer * l_n_words))
+            gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯",
+            score_hint=score_hint(l_wer, l_n_words))
         )
 
     env = Environment(
diff --git a/qurator/dinglehopper/tests/test_align.py b/qurator/dinglehopper/tests/test_align.py
index 96fc3c2..8e254e6 100644
--- a/qurator/dinglehopper/tests/test_align.py
+++ b/qurator/dinglehopper/tests/test_align.py
@@ -1,6 +1,7 @@
+import math
 import pytest
 from .util import unzip
-from .. import align, seq_align, distance
+from .. import align, seq_align, distance, score_hint
 
 
 def test_left_empty():
@@ -181,3 +182,7 @@ def test_lines_similar():
 
     # Test __eq__ (i.e. is it a substitution or a similar string?)
     assert list(left)[0] == list(right)[0]
+
+def test_score_hint():
+    assert score_hint(0.5, 23) == 12  # int(ceil())
+    assert score_hint(math.inf, 12345) is None

From 618ea567deb40414e4d8fc7cc8b8052aa4ae20dc Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 31 Oct 2023 19:08:25 +0100
Subject: [PATCH 037/176] =?UTF-8?q?=F0=9F=90=9B=20Fix=20docstring=20of=20d?=
 =?UTF-8?q?istance()=20for=20grapheme=20clusters?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/dinglehopper/edit_distance.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index 2120b80..32ef354 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -9,11 +9,11 @@ from .extracted_text import ExtractedText
 
 @multimethod
 def distance(seq1: list[str], seq2: list[str]):
-    """Compute the Levenshtein edit distance between two Unicode strings
+    """Compute the Levenshtein edit distance between two lists of grapheme clusters.
 
-    Note that this is different from levenshtein() as this function knows about Unicode
-    normalization and grapheme clusters. This should be the correct way to compare two
-    Unicode strings.
+    This assumes that the grapheme clusters are already normalized.
+
+    Use distance(str, str) instead if you need to compare two Unicode strings.
     """
     return Levenshtein.distance(seq1, seq2)
 

From 7c6ee593f0c6f0bec73d783c096ce8a31b9a5405 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 31 Oct 2023 19:13:19 +0100
Subject: [PATCH 038/176] =?UTF-8?q?=F0=9F=90=9B=20Fix=20score=5Fhint=20cal?=
 =?UTF-8?q?l=20in=20cli=5Fline=5Fdirs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/dinglehopper/cli_line_dirs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py
index 01ba959..00478cb 100644
--- a/qurator/dinglehopper/cli_line_dirs.py
+++ b/qurator/dinglehopper/cli_line_dirs.py
@@ -80,7 +80,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
         )
         word_diff_report += gen_diff_report(
             gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯",
-            score_hint=score_hint(l_wer, l_n_words))
+            score_hint=score_hint(l_wer, l_n_words)
         )
 
     env = Environment(

From de6cd8f1e7b97c27a9aeca878797d0491d8f1872 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 31 Oct 2023 20:40:27 +0100
Subject: [PATCH 039/176] =?UTF-8?q?=E2=9D=8E=20Make=20joining=20grapheme?=
 =?UTF-8?q?=20clusters=20more=20robust=20by=20checking=20joiner=20and=20ha?=
 =?UTF-8?q?ndling=20an=20empty=20joiner?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/dinglehopper/extracted_text.py | 34 +++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py
index 19ad9c1..28678e4 100644
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -1,4 +1,5 @@
 import enum
+import functools
 import re
 import unicodedata
 from contextlib import suppress
@@ -141,6 +142,15 @@ class ExtractedText:
         if value is not None and self._text is not None:
             raise ValueError("Can't have both segments and text")
 
+    @joiner.validator
+    def check(self, _, value):
+        if self.segments is None:
+            if value is not None:
+                raise ValueError("Can't have joiner without segments to join")
+        if self.segments is not None:
+            if value not in ("", " ", "\n"):
+                raise ValueError(f"Unexcepted segment joiner value {repr(value)}")
+
     @_text.validator
     def check(self, _, value):
         if value is None:
@@ -169,16 +179,34 @@ class ExtractedText:
         else:
             return self.joiner.join(s.text for s in self.segments)
 
+    @functools.cached_property
+    def _joiner_grapheme_cluster(self):
+        """We need the joiner as a list of 0 or 1 grapheme clusters.
+
+        This property is cached.
+        """
+
+        if len(self.joiner) > 0:
+            joiner_grapheme_cluster = list(grapheme_clusters(self.joiner))
+            assert len(joiner_grapheme_cluster) == 1  # see joiner's check above
+        elif len(self.joiner) == 0:
+            joiner_grapheme_cluster = []
+        else:
+            joiner_grapheme_cluster = None
+
+        return joiner_grapheme_cluster
+
     @property
     def grapheme_clusters(self):
         if self._text is not None:
             return self._grapheme_clusters
         else:
+            # TODO Test with text extracted at glyph level (joiner == "")
             clusters = []
             for seg in self.segments:
-                # todo could there be cases where joiner is no grapheme cluster?
-                clusters.extend(seg.grapheme_clusters + [self.joiner])
-            return clusters[:-1]
+                clusters += seg.grapheme_clusters + self._joiner_grapheme_cluster
+            clusters = clusters[:-1]
+            return clusters
 
     _segment_id_for_pos = None
 

From 68a12f8f7f7b82c47e06c8f65a921b7ebc1368c9 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 1 Nov 2023 13:48:07 +0100
Subject: [PATCH 040/176] =?UTF-8?q?=E2=AC=86=20Update=20uniseg=20dependenc?=
 =?UTF-8?q?y?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

@maxbachmann also improved the performance of uniseg. The improvement is included in
uniseg 0.7.2, so require at least that version.
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 7a2c39d..9bce7c4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 click
 jinja2
 lxml
-uniseg
+uniseg >= 0.7.2
 numpy
 colorama
 MarkupSafe

From b0e906ad00bbb4e8cc2a876aed1c6bae9d861f69 Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Thu, 21 Dec 2023 11:55:06 +0330
Subject: [PATCH 041/176] Update Levenshtein.ipynb

Fix a tiny typo in Levenshtein notebook.
---
 src/dinglehopper/notebooks/Levenshtein.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/notebooks/Levenshtein.ipynb b/src/dinglehopper/notebooks/Levenshtein.ipynb
index a27dca4..876bee3 100644
--- a/src/dinglehopper/notebooks/Levenshtein.ipynb
+++ b/src/dinglehopper/notebooks/Levenshtein.ipynb
@@ -22,7 +22,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "dinglehopper uses to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz."
+    "dinglehopper used to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz."
    ]
   },
   {

From 44bd4b5eda29a59f5f02dd8ceb2eef39d21d924d Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 2 Jan 2024 20:38:40 +0100
Subject: [PATCH 042/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hook?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d0ae66d..4f3562a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,26 +11,26 @@ repos:
     -   id: check-ast
 
 -   repo: https://github.com/psf/black
-    rev: 23.10.0
+    rev: 23.12.1
     hooks:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.1
+    rev: v0.1.10
     hooks:
     -   args:
-        - --fix
-        - --exit-non-zero-on-fix
+        -   --fix
+        -   --exit-non-zero-on-fix
         id: ruff
 
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.6.1
+    rev: v1.8.0
     hooks:
     -   additional_dependencies:
-        - types-setuptools
+        -   types-setuptools
         id: mypy
 
 -   repo: https://gitlab.com/vojko.pribudic/pre-commit-update
-    rev: v0.1.0
+    rev: v0.1.1
     hooks:
     -   id: pre-commit-update

From c1681551af19922c8fa4164ea76cccdde832c708 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 3 Jan 2024 19:21:53 +0100
Subject: [PATCH 043/176] =?UTF-8?q?=F0=9F=90=9B=20Fix=20generating=20word?=
 =?UTF-8?q?=20differences?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py
index e542697..99403ba 100644
--- a/src/dinglehopper/cli.py
+++ b/src/dinglehopper/cli.py
@@ -122,9 +122,11 @@ def process(
 
     gt_text = extract(gt, textequiv_level=textequiv_level)
     ocr_text = extract(ocr, textequiv_level=textequiv_level)
-    gt_words = words_normalized(gt_text)
-    ocr_words = words_normalized(ocr_text)
+    gt_words: list = list(words_normalized(gt_text))
+    ocr_words: list = list(words_normalized(ocr_text))
 
+    assert isinstance(gt_text, ExtractedText)
+    assert isinstance(ocr_text, ExtractedText)
     cer, n_characters = character_error_rate_n(gt_text, ocr_text)
     char_diff_report, diff_c = gen_diff_report(
         gt_text,
@@ -136,6 +138,10 @@ def process(
         differences=differences,
     )
 
+    # {gt,ocr}_words must not be a generator, so we don't drain it for the differences
+    # report.
+    assert isinstance(gt_words, list)
+    assert isinstance(ocr_words, list)
     wer, n_words = word_error_rate_n(gt_words, ocr_words)
     word_diff_report, diff_w = gen_diff_report(
         gt_words,

From 4832d1542fbb95cbf9fa19f893fb1fc5400645af Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 3 Jan 2024 20:38:49 +0100
Subject: [PATCH 044/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hook?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4f3562a..b76b8b6 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,7 +16,7 @@ repos:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.10
+    rev: v0.1.11
     hooks:
     -   args:
         -   --fix

From 071766efc2e32d8cd7c9ee0a8633ffdd62513e2e Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 3 Jan 2024 20:40:06 +0100
Subject: [PATCH 045/176] =?UTF-8?q?=F0=9F=90=9B=20Use=20Optional=20instead?=
 =?UTF-8?q?=20of=20|=20none,=20for=20Python=20<3.10?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/align.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/dinglehopper/align.py b/src/dinglehopper/align.py
index 1f7957a..c5f12f7 100644
--- a/src/dinglehopper/align.py
+++ b/src/dinglehopper/align.py
@@ -1,6 +1,7 @@
 import math
 import unicodedata
 from math import ceil
+from typing import Optional
 
 from rapidfuzz.distance import Levenshtein
 
@@ -14,7 +15,7 @@ def align(t1, t2):
     return seq_align(s1, s2)
 
 
-def score_hint(er: float, n: int) -> int | None:
+def score_hint(er: float, n: int) -> Optional[int]:
     """Calculate RapidFuzz score hint for a given error rate and count.
 
     Gives the score hint for the distance functions (= expected distance) or None if

From c752793be65bb769fd5f4284182131a49a2beb54 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 3 Jan 2024 20:52:07 +0100
Subject: [PATCH 046/176] =?UTF-8?q?=F0=9F=90=9B=20Use=20typing.List=20inst?=
 =?UTF-8?q?ead=20of=20list,=20for=20Python=20<3.9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/character_error_rate.py | 4 ++--
 src/dinglehopper/edit_distance.py        | 3 ++-
 src/dinglehopper/extracted_text.py       | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/dinglehopper/character_error_rate.py b/src/dinglehopper/character_error_rate.py
index 3b8c0cc..c0e3fe1 100644
--- a/src/dinglehopper/character_error_rate.py
+++ b/src/dinglehopper/character_error_rate.py
@@ -1,5 +1,5 @@
 import unicodedata
-from typing import Tuple
+from typing import Tuple, List
 
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
@@ -10,7 +10,7 @@ from .extracted_text import ExtractedText
 
 @multimethod
 def character_error_rate_n(
-    reference: list[str], compared: list[str]
+    reference: List[str], compared: List[str]
 ) -> Tuple[float, int]:
     """
     Compute character error rate.
diff --git a/src/dinglehopper/edit_distance.py b/src/dinglehopper/edit_distance.py
index ef90d81..8eec5e2 100644
--- a/src/dinglehopper/edit_distance.py
+++ b/src/dinglehopper/edit_distance.py
@@ -1,4 +1,5 @@
 import unicodedata
+from typing import List
 
 from multimethod import multimethod
 from rapidfuzz.distance import Levenshtein
@@ -8,7 +9,7 @@ from .extracted_text import ExtractedText
 
 
 @multimethod
-def distance(seq1: list[str], seq2: list[str]):
+def distance(seq1: List[str], seq2: List[str]):
     """Compute the Levenshtein edit distance between two lists of grapheme clusters.
 
     This assumes that the grapheme clusters are already normalized.
diff --git a/src/dinglehopper/extracted_text.py b/src/dinglehopper/extracted_text.py
index 28678e4..7ef9d1d 100644
--- a/src/dinglehopper/extracted_text.py
+++ b/src/dinglehopper/extracted_text.py
@@ -4,7 +4,7 @@ import re
 import unicodedata
 from contextlib import suppress
 from itertools import repeat
-from typing import Optional
+from typing import List, Optional
 
 import attr
 import numpy as np
@@ -135,7 +135,7 @@ class ExtractedText:
     segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
     joiner = attr.ib(type=Optional[str])
     _text = attr.ib(type=Optional[str])
-    _grapheme_clusters = attr.ib(type=Optional[list[str]])
+    _grapheme_clusters = attr.ib(type=Optional[List[str]])
 
     @segments.validator
     def check(self, _, value):

From 7a192880f1a5ea606108baf92e1d8b1a77f1282d Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 3 Jan 2024 20:58:24 +0100
Subject: [PATCH 047/176] =?UTF-8?q?=E2=AC=86=20Move=20on=20to=20supporting?=
 =?UTF-8?q?=20Python=20>=3D=203.8=20only?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/test.yml | 10 ++--------
 pyproject.toml             |  2 +-
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 61dc014..0f8485a 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,10 +25,9 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
 
-    # For Python 3.6, we need to fall back to Ubuntu 20.04
-    runs-on: ${{ matrix.python-version == '3.6' && 'ubuntu-20.04' || 'ubuntu-latest' }}
+    runs-on: "ubuntu-latest"
 
     env:
       test_results_dir: test-results-${{ matrix.python-version }}
@@ -44,11 +43,6 @@ jobs:
 
       - name: Update pip
         run: python3 -m pip install -U pip
-      - name: Avoid compiling OpenCV and NumPy on Python 3.6
-        run: |
-          if python3 --version | grep -q "Python 3.6"; then
-             pip install --prefer-binary -U opencv-python-headless numpy
-          fi
       - name: Install requirements*.txt
         run: |
           for requirements_txt in requirements*.txt; do
diff --git a/pyproject.toml b/pyproject.toml
index da33b15..ce32d56 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ authors = [
 ]
 description = "The OCR evaluation tool"
 readme = "README.md"
-requires-python = ">=3.6"
+requires-python = ">=3.8"
 keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
 
 dynamic = ["version", "dependencies", "optional-dependencies"]

From b36727ed9edcf9e3d676f6d255dbf4511955ce2c Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 8 Jan 2024 17:43:48 +0100
Subject: [PATCH 048/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hook?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b76b8b6..3ea6e96 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -31,6 +31,6 @@ repos:
         id: mypy
 
 -   repo: https://gitlab.com/vojko.pribudic/pre-commit-update
-    rev: v0.1.1
+    rev: v0.1.2
     hooks:
     -   id: pre-commit-update

From 4bf123de43db3f7975bb31ce398dfe97ac0692f9 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 8 Jan 2024 17:45:02 +0100
Subject: [PATCH 049/176] =?UTF-8?q?=E2=9A=99=20Update=20ruff+mypy=20depend?=
 =?UTF-8?q?encies?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements-dev.txt | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index 4bf395e..de6003d 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,8 +1,11 @@
 pytest
 pytest-cov
-pytest-mypy
 black
 pre-commit
 
-ruff ; python_version >= "3.7"
-pytest-ruff ; python_version >= "3.7"
+ruff
+pytest-ruff
+
+mypy
+types-setuptools
+pytest-mypy

From bf47308c0065b8cd6fc16d14996fffe06e5d947d Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Tue, 2 Jan 2024 23:07:30 +0330
Subject: [PATCH 050/176] Add report_tests workflow

---
 .github/workflows/test.yml        | 20 ++++----------------
 .github/workflows/test_report.yml | 20 ++++++++++++++++++++
 2 files changed, 24 insertions(+), 16 deletions(-)
 create mode 100644 .github/workflows/test_report.yml

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0f8485a..5c592ff 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,4 +1,4 @@
-name: test
+name: Test
 
 on:
 
@@ -29,9 +29,6 @@ jobs:
 
     runs-on: "ubuntu-latest"
 
-    env:
-      test_results_dir: test-results-${{ matrix.python-version }}
-
     steps:
       - name: Set up Python
         uses: actions/setup-python@v4
@@ -52,19 +49,10 @@ jobs:
       - name: Test
         run: |
             cd src
-            mkdir -p ../$test_results_dir
-            python3 -m pytest --junitxml=../$test_results_dir/junit.xml -o junit_family=legacy
+            python3 -m pytest --junitxml=../junit.xml -o junit_family=legacy
       - name: Upload test results
         uses: actions/upload-artifact@v3
         if: success() || failure()
         with:
-          name: ${{ env.test_results_dir }}
-          path: ${{ env.test_results_dir }}
-
-      - name: Report tests
-        uses: dorny/test-reporter@v1
-        if: success() || failure()
-        with:
-          name: Results on Python ${{ matrix.python-version }}
-          path: "${{env.test_results_dir }}/junit.xml"
-          reporter: java-junit
+          name: test-results-${{matrix.python-version}}
+          path: junit.xml
diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml
new file mode 100644
index 0000000..f237661
--- /dev/null
+++ b/.github/workflows/test_report.yml
@@ -0,0 +1,20 @@
+name: 'Test Report'
+on:
+  workflow_run:
+    workflows: ['test']
+    types:
+      - completed
+permissions:
+  contents: read
+  actions: read
+  checks: write
+jobs:
+  report:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: dorny/test-reporter@v1
+        with:
+          artifact: /test-results-.*/
+          name: Tests Results
+          path: 'junit.xml'
+          reporter: jest-junit  
\ No newline at end of file

From c90a61c12c9a462da7b89b5b83a32b4ada0824e9 Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Wed, 3 Jan 2024 22:40:43 +0330
Subject: [PATCH 051/176] Fix a few typos

---
 src/dinglehopper/character_error_rate.py | 2 +-
 src/dinglehopper/edit_distance.py        | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/dinglehopper/character_error_rate.py b/src/dinglehopper/character_error_rate.py
index c0e3fe1..5e2e02c 100644
--- a/src/dinglehopper/character_error_rate.py
+++ b/src/dinglehopper/character_error_rate.py
@@ -1,5 +1,5 @@
 import unicodedata
-from typing import Tuple, List
+from typing import List, Tuple
 
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
diff --git a/src/dinglehopper/edit_distance.py b/src/dinglehopper/edit_distance.py
index 8eec5e2..af1e047 100644
--- a/src/dinglehopper/edit_distance.py
+++ b/src/dinglehopper/edit_distance.py
@@ -3,6 +3,7 @@ from typing import List
 
 from multimethod import multimethod
 from rapidfuzz.distance import Levenshtein
+from typing import List
 from uniseg.graphemecluster import grapheme_clusters
 
 from .extracted_text import ExtractedText

From 6884c5c82579cebe5faeecd15701dd7c13c949dd Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Wed, 3 Jan 2024 23:16:43 +0330
Subject: [PATCH 052/176] Update dorny dependency

---
 .github/workflows/test_report.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml
index f237661..d1ea24d 100644
--- a/.github/workflows/test_report.yml
+++ b/.github/workflows/test_report.yml
@@ -12,7 +12,7 @@ jobs:
   report:
     runs-on: ubuntu-latest
     steps:
-      - uses: dorny/test-reporter@v1
+      - uses: dorny/test-reporter@v1.7.0
         with:
           artifact: /test-results-.*/
           name: Tests Results

From 4413ddac8f0b67e186cc39c74fcef92c30820b70 Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Wed, 3 Jan 2024 23:28:33 +0330
Subject: [PATCH 053/176] Temporary commit

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5c592ff..3dba451 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,7 +25,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
+        python-version: [ "3.12" ]
 
     runs-on: "ubuntu-latest"
 

From f4ff6a8f31f56e66c887ea7908f5b2f46030db6a Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Sat, 6 Jan 2024 23:39:56 +0330
Subject: [PATCH 054/176] Change reporter

---
 .github/workflows/test.yml        | 2 +-
 .github/workflows/test_report.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 3dba451..5c592ff 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,7 +25,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.12" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
 
     runs-on: "ubuntu-latest"
 
diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml
index d1ea24d..96b33fa 100644
--- a/.github/workflows/test_report.yml
+++ b/.github/workflows/test_report.yml
@@ -17,4 +17,4 @@ jobs:
           artifact: /test-results-.*/
           name: Tests Results
           path: 'junit.xml'
-          reporter: jest-junit  
\ No newline at end of file
+          reporter: java-junit  
\ No newline at end of file

From 967f833eac5d3b54a79d8ecac8e78df3800eacfb Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Sat, 6 Jan 2024 23:50:58 +0330
Subject: [PATCH 055/176] Improve report

---
 .github/workflows/test.yml        | 4 ++--
 .github/workflows/test_report.yml | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5c592ff..9b95bff 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -49,10 +49,10 @@ jobs:
       - name: Test
         run: |
             cd src
-            python3 -m pytest --junitxml=../junit.xml -o junit_family=legacy
+            python3 -m pytest --junitxml=../${{matrix.python-version}}-junit.xml -o junit_family=legacy
       - name: Upload test results
         uses: actions/upload-artifact@v3
         if: success() || failure()
         with:
           name: test-results-${{matrix.python-version}}
-          path: junit.xml
+          path: ${{matrix.python-version}}-junit.xml
diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml
index 96b33fa..253429f 100644
--- a/.github/workflows/test_report.yml
+++ b/.github/workflows/test_report.yml
@@ -14,7 +14,7 @@ jobs:
     steps:
       - uses: dorny/test-reporter@v1.7.0
         with:
-          artifact: /test-results-.*/
-          name: Tests Results
-          path: 'junit.xml'
+          artifact: /test-results-(.*)/
+          name: 'Tests Results - $1'
+          path: '*junit.xml'
           reporter: java-junit  
\ No newline at end of file

From 4466422cda9fa655c74c1f21ef434453e18f8cf9 Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Sun, 7 Jan 2024 00:08:18 +0330
Subject: [PATCH 056/176] Fix a typo

---
 src/dinglehopper/edit_distance.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/dinglehopper/edit_distance.py b/src/dinglehopper/edit_distance.py
index af1e047..8eec5e2 100644
--- a/src/dinglehopper/edit_distance.py
+++ b/src/dinglehopper/edit_distance.py
@@ -3,7 +3,6 @@ from typing import List
 
 from multimethod import multimethod
 from rapidfuzz.distance import Levenshtein
-from typing import List
 from uniseg.graphemecluster import grapheme_clusters
 
 from .extracted_text import ExtractedText

From 59a3882ce5fb9b0bb1c2f92148e06965d952f13b Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 8 Jan 2024 17:57:51 +0100
Subject: [PATCH 057/176] =?UTF-8?q?=F0=9F=A7=B9=20GitHub=20Actions:=20Clea?=
 =?UTF-8?q?n=20up=20whitespace?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/test_report.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml
index 253429f..908a593 100644
--- a/.github/workflows/test_report.yml
+++ b/.github/workflows/test_report.yml
@@ -17,4 +17,4 @@ jobs:
           artifact: /test-results-(.*)/
           name: 'Tests Results - $1'
           path: '*junit.xml'
-          reporter: java-junit  
\ No newline at end of file
+          reporter: java-junit

From ac9d360dcdf587588d076e56a5ffaaa7d1b2f1e6 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 8 Jan 2024 19:04:36 +0100
Subject: [PATCH 058/176] =?UTF-8?q?=F0=9F=94=8D=20mypy:=20Make=20cli.proce?=
 =?UTF-8?q?ss()=20typed=20so=20mypy=20checks=20it=20(and=20issues=20no=20w?=
 =?UTF-8?q?arning)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py
index 99403ba..5d2000a 100644
--- a/src/dinglehopper/cli.py
+++ b/src/dinglehopper/cli.py
@@ -105,14 +105,14 @@ def json_float(value):
 
 
 def process(
-    gt,
-    ocr,
-    report_prefix,
-    reports_folder=".",
+    gt: str,
+    ocr: str,
+    report_prefix: str,
+    reports_folder: str = ".",
     *,
-    metrics=True,
-    differences=False,
-    textequiv_level="region",
+    metrics: bool = True,
+    differences: bool = False,
+    textequiv_level: str = "region",
 ):
     """Check OCR result against GT.
 

From 24c25b6fcde5303b73ab1014fd5a1f68b0a16088 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 8 Jan 2024 19:30:20 +0100
Subject: [PATCH 059/176] =?UTF-8?q?=F0=9F=94=8D=20mypy:=20Avoid=20using=20?=
 =?UTF-8?q?check()=20for=20all=20attr=20validators?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/extracted_text.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/dinglehopper/extracted_text.py b/src/dinglehopper/extracted_text.py
index 7ef9d1d..992b3a9 100644
--- a/src/dinglehopper/extracted_text.py
+++ b/src/dinglehopper/extracted_text.py
@@ -122,7 +122,7 @@ class ExtractedText:
     segment_id = attr.ib(type=Optional[str])
 
     @segment_id.validator
-    def check(self, _, value):
+    def is_valid_segment_id(self, _, value):
         if value is None:
             return
         if not re.match(r"[\w\d_-]+", value):
@@ -138,12 +138,12 @@ class ExtractedText:
     _grapheme_clusters = attr.ib(type=Optional[List[str]])
 
     @segments.validator
-    def check(self, _, value):
+    def cant_set_both_segments_and_text(self, _, value):
         if value is not None and self._text is not None:
             raise ValueError("Can't have both segments and text")
 
     @joiner.validator
-    def check(self, _, value):
+    def is_valid_joiner(self, _, value):
         if self.segments is None:
             if value is not None:
                 raise ValueError("Can't have joiner without segments to join")
@@ -152,7 +152,7 @@ class ExtractedText:
                 raise ValueError(f"Unexcepted segment joiner value {repr(value)}")
 
     @_text.validator
-    def check(self, _, value):
+    def is_valid_text(self, _, value):
         if value is None:
             return
 
@@ -166,7 +166,7 @@ class ExtractedText:
             raise ValueError("Requires both text and grapheme clusters to be set")
 
     @_grapheme_clusters.validator
-    def check(self, _, value):
+    def are_valid_grapheme_clusters(self, _, value):
         if value is not None and self._text is None:
             raise ValueError("Requires both text and grapheme clusters to be set")
 

From 8166435958ca0ab8af023bf9bc94ce7da012e8f6 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 8 Jan 2024 19:33:25 +0100
Subject: [PATCH 060/176] =?UTF-8?q?=F0=9F=94=8D=20mypy:=20Remove=20Extract?=
 =?UTF-8?q?edText.segments=20converter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/extracted_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/extracted_text.py b/src/dinglehopper/extracted_text.py
index 992b3a9..af54d7c 100644
--- a/src/dinglehopper/extracted_text.py
+++ b/src/dinglehopper/extracted_text.py
@@ -132,7 +132,7 @@ class ExtractedText:
     # a. _text itself
     # b. or segments (ExtractedText) and a joiner
 
-    segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
+    segments = attr.ib(type=Optional[List["ExtractedText"]])
     joiner = attr.ib(type=Optional[str])
     _text = attr.ib(type=Optional[str])
     _grapheme_clusters = attr.ib(type=Optional[List[str]])

From ad316aeabca8da1ad179eaea00c4962f3d64e59a Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 9 Jan 2024 15:58:29 +0100
Subject: [PATCH 061/176] =?UTF-8?q?=F0=9F=94=8D=20mypy:=20Use=20a=20compat?=
 =?UTF-8?q?ible=20syntax=20for=20multimethod?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/character_error_rate.py | 10 ++++------
 src/dinglehopper/edit_distance.py        |  8 ++++----
 src/dinglehopper/word_error_rate.py      | 18 ++++++++----------
 3 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/src/dinglehopper/character_error_rate.py b/src/dinglehopper/character_error_rate.py
index 5e2e02c..35d3b07 100644
--- a/src/dinglehopper/character_error_rate.py
+++ b/src/dinglehopper/character_error_rate.py
@@ -30,17 +30,15 @@ def character_error_rate_n(
     # XXX Should we really count newlines here?
 
 
-@multimethod
-def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
+@character_error_rate_n.register
+def _(reference: str, compared: str) -> Tuple[float, int]:
     seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", reference)))
     seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", compared)))
     return character_error_rate_n(seq1, seq2)
 
 
-@multimethod
-def character_error_rate_n(
-    reference: ExtractedText, compared: ExtractedText
-) -> Tuple[float, int]:
+@character_error_rate_n.register
+def _(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
     return character_error_rate_n(
         reference.grapheme_clusters, compared.grapheme_clusters
     )
diff --git a/src/dinglehopper/edit_distance.py b/src/dinglehopper/edit_distance.py
index 8eec5e2..ac4a847 100644
--- a/src/dinglehopper/edit_distance.py
+++ b/src/dinglehopper/edit_distance.py
@@ -19,8 +19,8 @@ def distance(seq1: List[str], seq2: List[str]):
     return Levenshtein.distance(seq1, seq2)
 
 
-@multimethod
-def distance(s1: str, s2: str):
+@distance.register
+def _(s1: str, s2: str):
     """Compute the Levenshtein edit distance between two Unicode strings
 
     Note that this is different from levenshtein() as this function knows about Unicode
@@ -32,8 +32,8 @@ def distance(s1: str, s2: str):
     return Levenshtein.distance(seq1, seq2)
 
 
-@multimethod
-def distance(s1: ExtractedText, s2: ExtractedText):
+@distance.register
+def _(s1: ExtractedText, s2: ExtractedText):
     return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters)
 
 
diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py
index 2e65760..afb4fe0 100644
--- a/src/dinglehopper/word_error_rate.py
+++ b/src/dinglehopper/word_error_rate.py
@@ -60,8 +60,8 @@ def words(s: str):
             yield word
 
 
-@multimethod
-def words(s: ExtractedText):
+@words.register
+def _(s: ExtractedText):
     return words(s.text)
 
 
@@ -70,8 +70,8 @@ def words_normalized(s: str):
     return words(unicodedata.normalize("NFC", s))
 
 
-@multimethod
-def words_normalized(s: ExtractedText):
+@words_normalized.register
+def _(s: ExtractedText):
     return words_normalized(s.text)
 
 
@@ -82,15 +82,13 @@ def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
     return word_error_rate_n(reference_seq, compared_seq)
 
 
-@multimethod
-def word_error_rate_n(
-    reference: ExtractedText, compared: ExtractedText
-) -> Tuple[float, int]:
+@word_error_rate_n.register
+def _(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
     return word_error_rate_n(reference.text, compared.text)
 
 
-@multimethod
-def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
+@word_error_rate_n.register
+def _(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
     reference_seq = list(reference)
     compared_seq = list(compared)
 

From 483e809691704295ffde56acb72df003634c7b0b Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 10 Jan 2024 19:12:07 +0100
Subject: [PATCH 062/176] =?UTF-8?q?=F0=9F=94=8D=20mypy:=20Use=20an=20almos?=
 =?UTF-8?q?t=20strict=20mypy=20configuration,=20and=20fix=20any=20issues?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml                  |  2 ++
 pyproject.toml                           | 11 +++++++++
 requirements-dev.txt                     |  1 +
 src/dinglehopper/align.py                |  3 +--
 src/dinglehopper/character_error_rate.py | 13 +++++++----
 src/dinglehopper/cli.py                  | 11 +++++----
 src/dinglehopper/cli_summarize.py        |  5 ++--
 src/dinglehopper/edit_distance.py        |  6 ++---
 src/dinglehopper/extracted_text.py       | 16 ++++++++-----
 src/dinglehopper/ocr_files.py            | 21 +++++++++++------
 src/dinglehopper/word_error_rate.py      | 29 ++++++++++++++----------
 11 files changed, 77 insertions(+), 41 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3ea6e96..8c25236 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -28,6 +28,8 @@ repos:
     hooks:
     -   additional_dependencies:
         -   types-setuptools
+        -   types-lxml
+        -   numpy  # for numpy plugin
         id: mypy
 
 -   repo: https://gitlab.com/vojko.pribudic/pre-commit-update
diff --git a/pyproject.toml b/pyproject.toml
index ce32d56..05075e7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,9 +60,20 @@ markers = [
 
 
 [tool.mypy]
+plugins = ["numpy.typing.mypy_plugin"]
+
 ignore_missing_imports = true
 
 
+strict = true
+
+disallow_subclassing_any = false
+# ❗ error: Class cannot subclass "Processor" (has type "Any")
+disallow_any_generics = false
+disallow_untyped_defs = false
+disallow_untyped_calls = false
+
+
 [tool.ruff]
 select = ["E", "F", "I"]
 ignore = [
diff --git a/requirements-dev.txt b/requirements-dev.txt
index de6003d..16ae880 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -7,5 +7,6 @@ ruff
 pytest-ruff
 
 mypy
+types-lxml
 types-setuptools
 pytest-mypy
diff --git a/src/dinglehopper/align.py b/src/dinglehopper/align.py
index c5f12f7..5d1f290 100644
--- a/src/dinglehopper/align.py
+++ b/src/dinglehopper/align.py
@@ -4,8 +4,7 @@ from math import ceil
 from typing import Optional
 
 from rapidfuzz.distance import Levenshtein
-
-from .edit_distance import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 
 
 def align(t1, t2):
diff --git a/src/dinglehopper/character_error_rate.py b/src/dinglehopper/character_error_rate.py
index 35d3b07..88a88f8 100644
--- a/src/dinglehopper/character_error_rate.py
+++ b/src/dinglehopper/character_error_rate.py
@@ -1,5 +1,5 @@
 import unicodedata
-from typing import List, Tuple
+from typing import List, Tuple, TypeVar
 
 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
@@ -7,6 +7,8 @@ from uniseg.graphemecluster import grapheme_clusters
 from .edit_distance import distance
 from .extracted_text import ExtractedText
 
+T = TypeVar("T")
+
 
 @multimethod
 def character_error_rate_n(
@@ -34,21 +36,24 @@ def character_error_rate_n(
 def _(reference: str, compared: str) -> Tuple[float, int]:
     seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", reference)))
     seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", compared)))
-    return character_error_rate_n(seq1, seq2)
+    cer, n = character_error_rate_n(seq1, seq2)
+    return cer, n
 
 
 @character_error_rate_n.register
 def _(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
-    return character_error_rate_n(
+    cer, n = character_error_rate_n(
         reference.grapheme_clusters, compared.grapheme_clusters
     )
+    return cer, n
 
 
-def character_error_rate(reference, compared) -> float:
+def character_error_rate(reference: T, compared: T) -> float:
     """
     Compute character error rate.
 
     :return: character error rate
     """
+    cer: float
     cer, _ = character_error_rate_n(reference, compared)
     return cer
diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py
index 5d2000a..a58a2af 100644
--- a/src/dinglehopper/cli.py
+++ b/src/dinglehopper/cli.py
@@ -1,5 +1,6 @@
 import os
 from collections import Counter
+from typing import List
 
 import click
 from jinja2 import Environment, FileSystemLoader
@@ -76,7 +77,7 @@ def gen_diff_report(
         if o is not None:
             o_pos += len(o)
 
-    found_differences = dict(Counter(elem for elem in found_differences))
+    counted_differences = dict(Counter(elem for elem in found_differences))
 
     return (
         """
@@ -87,7 +88,7 @@ def gen_diff_report(
         """.format(
             gtx, ocrx
         ),
-        found_differences,
+        counted_differences,
     )
 
 
@@ -113,7 +114,7 @@ def process(
     metrics: bool = True,
     differences: bool = False,
     textequiv_level: str = "region",
-):
+) -> None:
     """Check OCR result against GT.
 
     The @click decorators change the signature of the decorated functions, so we keep
@@ -122,8 +123,8 @@ def process(
 
     gt_text = extract(gt, textequiv_level=textequiv_level)
     ocr_text = extract(ocr, textequiv_level=textequiv_level)
-    gt_words: list = list(words_normalized(gt_text))
-    ocr_words: list = list(words_normalized(ocr_text))
+    gt_words: List[str] = list(words_normalized(gt_text))
+    ocr_words: List[str] = list(words_normalized(ocr_text))
 
     assert isinstance(gt_text, ExtractedText)
     assert isinstance(ocr_text, ExtractedText)
diff --git a/src/dinglehopper/cli_summarize.py b/src/dinglehopper/cli_summarize.py
index e0c20cb..c49911b 100644
--- a/src/dinglehopper/cli_summarize.py
+++ b/src/dinglehopper/cli_summarize.py
@@ -1,5 +1,6 @@
 import json
 import os
+from typing import Dict
 
 import click
 from jinja2 import Environment, FileSystemLoader
@@ -13,8 +14,8 @@ def process(reports_folder, occurrences_threshold=1):
     wer_list = []
     cer_sum = 0
     wer_sum = 0
-    diff_c = {}
-    diff_w = {}
+    diff_c: Dict[str, int] = {}
+    diff_w: Dict[str, int] = {}
 
     for report in os.listdir(reports_folder):
         if report.endswith(".json"):
diff --git a/src/dinglehopper/edit_distance.py b/src/dinglehopper/edit_distance.py
index ac4a847..ec564ae 100644
--- a/src/dinglehopper/edit_distance.py
+++ b/src/dinglehopper/edit_distance.py
@@ -9,7 +9,7 @@ from .extracted_text import ExtractedText
 
 
 @multimethod
-def distance(seq1: List[str], seq2: List[str]):
+def distance(seq1: List[str], seq2: List[str]) -> int:
     """Compute the Levenshtein edit distance between two lists of grapheme clusters.
 
     This assumes that the grapheme clusters are already normalized.
@@ -20,7 +20,7 @@ def distance(seq1: List[str], seq2: List[str]):
 
 
 @distance.register
-def _(s1: str, s2: str):
+def _(s1: str, s2: str) -> int:
     """Compute the Levenshtein edit distance between two Unicode strings
 
     Note that this is different from levenshtein() as this function knows about Unicode
@@ -33,7 +33,7 @@ def _(s1: str, s2: str):
 
 
 @distance.register
-def _(s1: ExtractedText, s2: ExtractedText):
+def _(s1: ExtractedText, s2: ExtractedText) -> int:
     return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters)
 
 
diff --git a/src/dinglehopper/extracted_text.py b/src/dinglehopper/extracted_text.py
index af54d7c..e4b0915 100644
--- a/src/dinglehopper/extracted_text.py
+++ b/src/dinglehopper/extracted_text.py
@@ -4,7 +4,7 @@ import re
 import unicodedata
 from contextlib import suppress
 from itertools import repeat
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 
 import attr
 import numpy as np
@@ -173,10 +173,11 @@ class ExtractedText:
     normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
 
     @property
-    def text(self):
+    def text(self) -> str:
         if self._text is not None:
             return self._text
         else:
+            assert self.joiner is not None and self.segments is not None
             return self.joiner.join(s.text for s in self.segments)
 
     @functools.cached_property
@@ -186,6 +187,7 @@ class ExtractedText:
         This property is cached.
         """
 
+        assert self.joiner is not None
         if len(self.joiner) > 0:
             joiner_grapheme_cluster = list(grapheme_clusters(self.joiner))
             assert len(joiner_grapheme_cluster) == 1  # see joiner's check above
@@ -203,6 +205,7 @@ class ExtractedText:
         else:
             # TODO Test with text extracted at glyph level (joiner == "")
             clusters = []
+            assert self.segments is not None
             for seg in self.segments:
                 clusters += seg.grapheme_clusters + self._joiner_grapheme_cluster
             clusters = clusters[:-1]
@@ -218,6 +221,7 @@ class ExtractedText:
             else:
                 # Recurse
                 segment_id_for_pos = []
+                assert self.joiner is not None and self.segments is not None
                 for s in self.segments:
                     seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))]
                     segment_id_for_pos.extend(seg_ids)
@@ -280,7 +284,7 @@ def invert_dict(d):
     return {v: k for k, v in d.items()}
 
 
-def get_textequiv_unicode(text_segment, nsmap) -> str:
+def get_textequiv_unicode(text_segment: Any, nsmap: Dict[str, str]) -> str:
     """Get the TextEquiv/Unicode text of the given PAGE text element."""
     segment_id = text_segment.attrib["id"]
     textequivs = text_segment.findall("./page:TextEquiv", namespaces=nsmap)
@@ -304,7 +308,7 @@ def get_first_textequiv(textequivs, segment_id):
     if np.any(~nan_mask):
         if np.any(nan_mask):
             log.warning("TextEquiv without index in %s.", segment_id)
-        index = np.nanargmin(indices)
+        index = int(np.nanargmin(indices))
     else:
         # try ordering by conf
         confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
@@ -313,7 +317,7 @@ def get_first_textequiv(textequivs, segment_id):
                 "No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
                 segment_id,
             )
-            index = np.nanargmax(confidences)
+            index = int(np.nanargmax(confidences))
         else:
             # fallback to first entry in case of neither index or conf present
             log.warning("No index attributes, use first TextEquiv in %s.", segment_id)
@@ -321,7 +325,7 @@ def get_first_textequiv(textequivs, segment_id):
     return textequivs[index]
 
 
-def get_attr(te, attr_name) -> float:
+def get_attr(te: Any, attr_name: str) -> float:
     """Extract the attribute for the given name.
 
     Note: currently only handles numeric values!
diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py
index be66719..f9bd977 100644
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@@ -1,6 +1,6 @@
 import os
 import sys
-from typing import Iterator
+from typing import Dict, Iterator, Optional
 
 import chardet
 from lxml import etree as ET
@@ -10,11 +10,11 @@ from uniseg.graphemecluster import grapheme_clusters
 from .extracted_text import ExtractedText, normalize_sbb
 
 
-def alto_namespace(tree: ET.ElementTree) -> str:
+def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
     """Return the ALTO namespace used in the given ElementTree.
 
     This relies on the assumption that, in any given ALTO file, the root element has the
-    local name "alto". We do not check if the files uses any valid ALTO namespace.
+    local name "alto". We do not check if the file uses any valid ALTO namespace.
     """
     root_name = ET.QName(tree.getroot().tag)
     if root_name.localname == "alto":
@@ -23,8 +23,15 @@ def alto_namespace(tree: ET.ElementTree) -> str:
         raise ValueError("Not an ALTO tree")
 
 
-def alto_extract_lines(tree: ET.ElementTree) -> Iterator[ExtractedText]:
-    nsmap = {"alto": alto_namespace(tree)}
+def alto_nsmap(tree: ET._ElementTree) -> Dict[str, str]:
+    alto_ns = alto_namespace(tree)
+    if alto_ns is None:
+        raise ValueError("Could not determine ALTO namespace")
+    return {"alto": alto_ns}
+
+
+def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]:
+    nsmap = alto_nsmap(tree)
     for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
         line_id = line.attrib.get("ID")
         line_text = " ".join(
@@ -37,7 +44,7 @@ def alto_extract_lines(tree: ET.ElementTree) -> Iterator[ExtractedText]:
         # FIXME hardcoded SBB normalization
 
 
-def alto_extract(tree: ET.ElementTree) -> ExtractedText:
+def alto_extract(tree: ET._ElementTree) -> ExtractedText:
     """Extract text from the given ALTO ElementTree."""
     return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None, None)
 
@@ -98,7 +105,7 @@ def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
     if ET.QName(group.tag).localname in ["OrderedGroup", "OrderedGroupIndexed"]:
         ro_children = list(group)
 
-        ro_children = filter(lambda child: "index" in child.attrib.keys(), ro_children)
+        ro_children = [child for child in ro_children if "index" in child.attrib.keys()]
         ro_children = sorted(ro_children, key=lambda child: int(child.attrib["index"]))
     elif ET.QName(group.tag).localname in ["UnorderedGroup", "UnorderedGroupIndexed"]:
         ro_children = list(group)
diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py
index afb4fe0..b6e0a3a 100644
--- a/src/dinglehopper/word_error_rate.py
+++ b/src/dinglehopper/word_error_rate.py
@@ -1,5 +1,5 @@
 import unicodedata
-from typing import Iterable, Tuple
+from typing import Generator, Iterable, Tuple, TypeVar
 
 import uniseg.wordbreak
 from multimethod import multimethod
@@ -7,6 +7,8 @@ from rapidfuzz.distance import Levenshtein
 
 from .extracted_text import ExtractedText
 
+T = TypeVar("T")
+
 # Did we patch uniseg.wordbreak.word_break already?
 word_break_patched = False
 
@@ -32,7 +34,7 @@ def patch_word_break():
 
 
 @multimethod
-def words(s: str):
+def words(s: str) -> Generator[str, None, None]:
     """Extract words from a string"""
 
     global word_break_patched
@@ -61,34 +63,36 @@ def words(s: str):
 
 
 @words.register
-def _(s: ExtractedText):
-    return words(s.text)
+def _(s: ExtractedText) -> Generator[str, None, None]:
+    yield from words(s.text)
 
 
 @multimethod
-def words_normalized(s: str):
-    return words(unicodedata.normalize("NFC", s))
+def words_normalized(s: str) -> Generator[str, None, None]:
+    yield from words(unicodedata.normalize("NFC", s))
 
 
 @words_normalized.register
-def _(s: ExtractedText):
-    return words_normalized(s.text)
+def _(s: ExtractedText) -> Generator[str, None, None]:
+    yield from words_normalized(s.text)
 
 
 @multimethod
 def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
     reference_seq = list(words_normalized(reference))
     compared_seq = list(words_normalized(compared))
-    return word_error_rate_n(reference_seq, compared_seq)
+    wer, n = word_error_rate_n(reference_seq, compared_seq)
+    return wer, n
 
 
 @word_error_rate_n.register
 def _(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
-    return word_error_rate_n(reference.text, compared.text)
+    wer, n = word_error_rate_n(reference.text, compared.text)
+    return wer, n
 
 
 @word_error_rate_n.register
-def _(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
+def _(reference: Iterable[T], compared: Iterable[T]) -> Tuple[float, int]:
     reference_seq = list(reference)
     compared_seq = list(compared)
 
@@ -102,6 +106,7 @@ def _(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
     return d / n, n
 
 
-def word_error_rate(reference, compared) -> float:
+def word_error_rate(reference: T, compared: T) -> float:
+    wer: float
     wer, _ = word_error_rate_n(reference, compared)
     return wer

From 344f96dca91adb9d194b5bde4bd691196a09019e Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jan 2024 18:09:29 +0000
Subject: [PATCH 063/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab-ci.yml | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 .gitlab-ci.yml

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..1201955
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,7 @@
+stages:
+  - triggers
+
+mirror:
+  stage: triggers
+  trigger:
+    include: .gitlab/mirror.yml
\ No newline at end of file

From af83b35f2372a83ad880496e329b85788be04738 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jan 2024 19:18:35 +0100
Subject: [PATCH 064/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 .gitlab/mirror.yml

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
new file mode 100644
index 0000000..c1cb76c
--- /dev/null
+++ b/.gitlab/mirror.yml
@@ -0,0 +1,15 @@
+stages:
+  - pull
+
+default:
+  image: alpine
+
+pull-gitlab:
+  stage: pull
+  script:
+    - echo "This is redundant"
+
+pull-github:
+  stage: pull
+  script:
+    - git pull https://github.com/qurator-spk/dinglehopper.git

From 6d8afc27b341d429d825d69aadc3a411815ac4a7 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jan 2024 19:51:18 +0100
Subject: [PATCH 065/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index c1cb76c..9c6fa30 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -12,4 +12,5 @@ pull-gitlab:
 pull-github:
   stage: pull
   script:
+    - apk add --no-cache git
     - git pull https://github.com/qurator-spk/dinglehopper.git

From e083688c66fbdb0323361245f015186aadc1f3a2 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jan 2024 20:01:56 +0100
Subject: [PATCH 066/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index 9c6fa30..43e6183 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -11,6 +11,7 @@ pull-gitlab:
 
 pull-github:
   stage: pull
-  script:
+  before_script:
     - apk add --no-cache git
+  script:
     - git pull https://github.com/qurator-spk/dinglehopper.git

From c77e8f51aba9e0201df8fe30689d4befe7915137 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jan 2024 20:09:30 +0100
Subject: [PATCH 067/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index 43e6183..294f1e1 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -1,9 +1,6 @@
 stages:
   - pull
 
-default:
-  image: alpine
-
 pull-gitlab:
   stage: pull
   script:
@@ -11,7 +8,7 @@ pull-gitlab:
 
 pull-github:
   stage: pull
-  before_script:
-    - apk add --no-cache git
+#  before_script:
+#    - apk add --no-cache git
   script:
     - git pull https://github.com/qurator-spk/dinglehopper.git

From dc390cd3f809957068f0d795134d64440da837cb Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jan 2024 20:17:10 +0100
Subject: [PATCH 068/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index 294f1e1..9846030 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -1,6 +1,9 @@
 stages:
   - pull
 
+default:
+  image: debian
+
 pull-gitlab:
   stage: pull
   script:

From 81391132f0ee0ce65f6ab1fdaa4de3572a1a9405 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jan 2024 20:22:28 +0100
Subject: [PATCH 069/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index 9846030..33bc502 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -14,4 +14,5 @@ pull-github:
 #  before_script:
 #    - apk add --no-cache git
   script:
+    - whoami
     - git pull https://github.com/qurator-spk/dinglehopper.git

From d0ddfa68a12d19a045d5f9ebe0cccb300b1ce1ef Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jan 2024 20:25:00 +0100
Subject: [PATCH 070/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index 33bc502..9863e9e 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -11,8 +11,8 @@ pull-gitlab:
 
 pull-github:
   stage: pull
-#  before_script:
-#    - apk add --no-cache git
-  script:
+  before_script:
     - whoami
+    - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+  script:
     - git pull https://github.com/qurator-spk/dinglehopper.git

From 484da90d273bc8593092f1a11d1741251e7ab6eb Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jan 2024 20:34:07 +0100
Subject: [PATCH 071/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab-ci.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1201955..966b806 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,7 +1,10 @@
+variables:
+  http_proxy: "http://http-proxy.sbb.spk-berlin.de:3128/"
+
 stages:
   - triggers
 
 mirror:
   stage: triggers
   trigger:
-    include: .gitlab/mirror.yml
\ No newline at end of file
+    include: .gitlab/mirror.yml

From 6d947a9ca96144248c8e04560f118554309a5dbf Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jan 2024 20:37:41 +0100
Subject: [PATCH 072/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index 9863e9e..82117a7 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -12,7 +12,8 @@ pull-gitlab:
 pull-github:
   stage: pull
   before_script:
-    - whoami
+    - whoami; env
     - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
   script:
+    - whoami; env
     - git pull https://github.com/qurator-spk/dinglehopper.git

From 10d423f0451d52aa18c658f8e1d2ca22def87836 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jan 2024 20:41:11 +0100
Subject: [PATCH 073/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab-ci.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 966b806..f1c4fdd 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,5 +1,8 @@
 variables:
   http_proxy: "http://http-proxy.sbb.spk-berlin.de:3128/"
+  https_proxy: "http://http-proxy.sbb.spk-berlin.de:3128/"
+  HTTP_PROXY: "http://http-proxy.sbb.spk-berlin.de:3128/"
+  HTTPS_PROXY: "http://http-proxy.sbb.spk-berlin.de:3128/"
 
 stages:
   - triggers

From 10ccba989e651771d7c7142b9ed8ca86419ce165 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jan 2024 20:43:34 +0100
Subject: [PATCH 074/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index 82117a7..8a39ac8 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -16,4 +16,6 @@ pull-github:
     - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
   script:
     - whoami; env
-    - git pull https://github.com/qurator-spk/dinglehopper.git
+    - git remote add github https://github.com/qurator-spk/dinglehopper.git
+    - git remote -v
+    - git pull github master

From 21c44d426eefaf7f9687229ea36b63ed96da6051 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 12 Jan 2024 12:38:47 +0100
Subject: [PATCH 075/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hook?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8c25236..f3aa0b5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,7 +16,7 @@ repos:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.11
+    rev: v0.1.12
     hooks:
     -   args:
         -   --fix

From ff34c65c1e107e7b48be2160a15e8b3dea2a7d2c Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 12 Jan 2024 12:42:40 +0100
Subject: [PATCH 076/176] =?UTF-8?q?=F0=9F=94=8D=20ruff:=20Remove=20ignore?=
 =?UTF-8?q?=20configuration,=20we=20use=20multimethods=20in=20a=20compatib?=
 =?UTF-8?q?le=20way=20now?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 05075e7..41d45ba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,6 +76,3 @@ disallow_untyped_calls = false
 
 [tool.ruff]
 select = ["E", "F", "I"]
-ignore = [
-    "F811",  # multimethods are considered redefinitions by ruff
-]

From a95a85a889a6b4a4f90818f1a35b99c6034c0b05 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 12 Jan 2024 12:45:32 +0100
Subject: [PATCH 077/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index 8a39ac8..d621e31 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -16,6 +16,9 @@ pull-github:
     - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
   script:
     - whoami; env
+
+    - git remove remove github 2>/dev/null || true
     - git remote add github https://github.com/qurator-spk/dinglehopper.git
     - git remote -v
+
     - git pull github master

From 83cef3106f3d7b23c590c81cd63785796539a5f5 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 12 Jan 2024 12:46:40 +0100
Subject: [PATCH 078/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index d621e31..d99cafc 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -17,7 +17,7 @@ pull-github:
   script:
     - whoami; env
 
-    - git remove remove github 2>/dev/null || true
+    - git remote remove github 2>/dev/null || true
     - git remote add github https://github.com/qurator-spk/dinglehopper.git
     - git remote -v
 

From 5eba65f0976f57f4a0fb929fe82710c43b0367c2 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 12 Jan 2024 16:08:26 +0100
Subject: [PATCH 079/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test:=20Tr?=
 =?UTF-8?q?igger=20only=20on=20default=20branch=20(and=20do=20not=20hardco?=
 =?UTF-8?q?de=20it)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab-ci.yml     | 2 ++
 .gitlab/mirror.yml | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f1c4fdd..7b252af 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -11,3 +11,5 @@ mirror:
   stage: triggers
   trigger:
     include: .gitlab/mirror.yml
+  rules:
+    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index d99cafc..86a4021 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -17,8 +17,10 @@ pull-github:
   script:
     - whoami; env
 
+    - if [ -z "$CI_COMMIT_BRANCH" ]; then echo "Not on a branch" >&2; exit 3; fi
+
     - git remote remove github 2>/dev/null || true
     - git remote add github https://github.com/qurator-spk/dinglehopper.git
     - git remote -v
 
-    - git pull github master
+    - git pull github "$CI_COMMIT_BRANCH"

From 6cfb49fe39dfdb0292c383c9b79848d7f6c387e0 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 12 Jan 2024 16:14:22 +0100
Subject: [PATCH 080/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test:=20Pu?=
 =?UTF-8?q?sh=20after=20pulling?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index 86a4021..9785e77 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -1,5 +1,6 @@
 stages:
   - pull
+  - push
 
 default:
   image: debian
@@ -24,3 +25,13 @@ pull-github:
     - git remote -v
 
     - git pull github "$CI_COMMIT_BRANCH"
+
+push-gitlab:
+  stage: push
+  script:
+    - git push origin "$CI_COMMIT_BRANCH"
+
+push-github:
+  stage: push
+  script:
+    - git push github "$CI_COMMIT_BRANCH"

From f8e31089b3e7f8db7d5ef8648aa99015b3fba45c Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 12 Jan 2024 16:17:38 +0100
Subject: [PATCH 081/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test:=20Pu?=
 =?UTF-8?q?sh=20after=20pulling?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index 9785e77..5c2eb25 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -28,10 +28,14 @@ pull-github:
 
 push-gitlab:
   stage: push
+  before_script:
+    - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
   script:
     - git push origin "$CI_COMMIT_BRANCH"
 
 push-github:
   stage: push
+  before_script:
+    - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
   script:
     - git push github "$CI_COMMIT_BRANCH"

From 76c4533aa5f83a45e4a402b9c5c373c15a101440 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 12 Jan 2024 16:25:52 +0100
Subject: [PATCH 082/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test:=20Pu?=
 =?UTF-8?q?sh=20after=20pulling?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 62 +++++++++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 28 deletions(-)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index 5c2eb25..92275e9 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -1,41 +1,47 @@
 stages:
-  - pull
-  - push
+    - check
+    - pull
+    - push
 
 default:
-  image: debian
+    image: debian
+
+
+check:
+    state: check
+
+    script:
+        - whoami; env
+        - if [ -z "$CI_COMMIT_BRANCH" ]; then echo "Not on a branch" >&2; exit 3; fi
+
 
 pull-gitlab:
-  stage: pull
-  script:
-    - echo "This is redundant"
+    stage: pull
+    script:
+        - echo "This is redundant"
 
 pull-github:
-  stage: pull
-  before_script:
-    - whoami; env
-    - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
-  script:
-    - whoami; env
+    stage: pull
+    before_script:
+        - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+    script:
+        - git remote remove github 2>/dev/null || true
+        - git remote add github https://github.com/qurator-spk/dinglehopper.git
+        - git remote -v
 
-    - if [ -z "$CI_COMMIT_BRANCH" ]; then echo "Not on a branch" >&2; exit 3; fi
+        - git pull github "$CI_COMMIT_BRANCH"
 
-    - git remote remove github 2>/dev/null || true
-    - git remote add github https://github.com/qurator-spk/dinglehopper.git
-    - git remote -v
-
-    - git pull github "$CI_COMMIT_BRANCH"
 
 push-gitlab:
-  stage: push
-  before_script:
-    - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
-  script:
-    - git push origin "$CI_COMMIT_BRANCH"
+    stage: push
+    before_script:
+        - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+    script:
+        - git push origin "$CI_COMMIT_SHA":"$CI_COMMIT_BRANCH"
 
 push-github:
-  stage: push
-  before_script:
-    - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
-  script:
-    - git push github "$CI_COMMIT_BRANCH"
+    stage: push
+    before_script:
+        - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+    script:
+        - git push github "$CI_COMMIT_SHA":"$CI_COMMIT_BRANCH"

From 250ee2b7f2043078463ba2a721fd0940fb4b5cf8 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 12 Jan 2024 16:26:55 +0100
Subject: [PATCH 083/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test:=20Pu?=
 =?UTF-8?q?sh=20after=20pulling?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab/mirror.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab/mirror.yml b/.gitlab/mirror.yml
index 92275e9..f3591a2 100644
--- a/.gitlab/mirror.yml
+++ b/.gitlab/mirror.yml
@@ -8,7 +8,7 @@ default:
 
 
 check:
-    state: check
+    stage: check
 
     script:
         - whoami; env

From 7e033b6f03883d0e778a56da748a30f5e0e760b8 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 12 Jan 2024 16:35:08 +0100
Subject: [PATCH 084/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test:=20De?=
 =?UTF-8?q?pend=20on=20child=20pipeline?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 7b252af..f93f4a2 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -9,6 +9,7 @@ stages:
 
 mirror:
   stage: triggers
+  strategy: depend
   trigger:
     include: .gitlab/mirror.yml
   rules:

From 4b64398cec6328fbeaadc3e5d97251aa0564d747 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 12 Jan 2024 16:38:27 +0100
Subject: [PATCH 085/176] =?UTF-8?q?=F0=9F=9A=A7=20GitLab=20CI=20Test:=20De?=
 =?UTF-8?q?pend=20on=20child=20pipeline?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f93f4a2..bdcb93a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -9,8 +9,8 @@ stages:
 
 mirror:
   stage: triggers
-  strategy: depend
   trigger:
     include: .gitlab/mirror.yml
+    strategy: depend
   rules:
     - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH

From 4016c016389f62f6b66ae532e0b1f3af095ad23e Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 26 Mar 2024 18:58:56 +0100
Subject: [PATCH 086/176] =?UTF-8?q?=F0=9F=90=9B=20README.md:=20Fix=20test?=
 =?UTF-8?q?=20badge?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 00fd899..1693d89 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ compares a ground truth (GT) document page with a OCR result page to compute
 metrics and a word/character differences report. It also supports batch processing by
 generating, aggregating and summarizing multiple reports.
 
-[![Tests](https://github.com/qurator-spk/dinglehopper/workflows/test/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
+[![Tests](https://github.com/qurator-spk/dinglehopper/workflows/test.yml/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
 [![GitHub tag](https://img.shields.io/github/tag/qurator-spk/dinglehopper?include_prereleases=&sort=semver&color=blue)](https://github.com/qurator-spk/dinglehopper/releases/)
 [![License](https://img.shields.io/badge/License-Apache-blue)](#license)
 [![issues - dinglehopper](https://img.shields.io/github/issues/qurator-spk/dinglehopper)](https://github.com/qurator-spk/dinglehopper/issues)

From 0e3d24cac17fbd24f38d34c3e8d72161af3f75a4 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 26 Mar 2024 19:01:25 +0100
Subject: [PATCH 087/176] =?UTF-8?q?=F0=9F=90=9B=20README.md:=20Fix=20badge?=
 =?UTF-8?q?=20(for=20real)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1693d89..035133c 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ compares a ground truth (GT) document page with a OCR result page to compute
 metrics and a word/character differences report. It also supports batch processing by
 generating, aggregating and summarizing multiple reports.
 
-[![Tests](https://github.com/qurator-spk/dinglehopper/workflows/test.yml/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
+[![Tests](https://github.com/qurator-spk/dinglehopper/actions/workflows/test.yml/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
 [![GitHub tag](https://img.shields.io/github/tag/qurator-spk/dinglehopper?include_prereleases=&sort=semver&color=blue)](https://github.com/qurator-spk/dinglehopper/releases/)
 [![License](https://img.shields.io/badge/License-Apache-blue)](#license)
 [![issues - dinglehopper](https://img.shields.io/github/issues/qurator-spk/dinglehopper)](https://github.com/qurator-spk/dinglehopper/issues)

From 4d4ead4cc80075aa415be8beebfe81298a247144 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 26 Mar 2024 19:34:22 +0100
Subject: [PATCH 088/176] =?UTF-8?q?=F0=9F=90=9B=20Fix=20word=20segmentatio?=
 =?UTF-8?q?n=20with=20uniseg=200.8.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements.txt                    | 2 +-
 src/dinglehopper/word_error_rate.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 851fec1..8f863cc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 click
 jinja2
 lxml
-uniseg >= 0.7.2
+uniseg >= 0.8.0
 numpy
 colorama
 MarkupSafe
diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py
index b6e0a3a..b759a69 100644
--- a/src/dinglehopper/word_error_rate.py
+++ b/src/dinglehopper/word_error_rate.py
@@ -24,7 +24,7 @@ def patch_word_break():
 
     def new_word_break(c, index=0):
         if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
-            return "ALetter"
+            return uniseg.wordbreak.WordBreak.ALETTER
         else:
             return old_word_break(c, index)
 

From 4dc6b7dc042ea6b3a517799617339e79db36f670 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 26 Mar 2024 19:40:07 +0100
Subject: [PATCH 089/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hook?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f3aa0b5..819d14e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,12 +11,12 @@ repos:
     -   id: check-ast
 
 -   repo: https://github.com/psf/black
-    rev: 23.12.1
+    rev: 24.3.0
     hooks:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.12
+    rev: v0.3.4
     hooks:
     -   args:
         -   --fix
@@ -24,7 +24,7 @@ repos:
         id: ruff
 
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.8.0
+    rev: v1.9.0
     hooks:
     -   additional_dependencies:
         -   types-setuptools
@@ -33,6 +33,6 @@ repos:
         id: mypy
 
 -   repo: https://gitlab.com/vojko.pribudic/pre-commit-update
-    rev: v0.1.2
+    rev: v0.2.1
     hooks:
     -   id: pre-commit-update

From 19d1a00817a648490a47d01d5fe0f668b7c608d6 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 27 Mar 2024 17:36:05 +0100
Subject: [PATCH 090/176] =?UTF-8?q?=F0=9F=8E=A8=20Reformat=20(Black)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/tests/test_integ_ocrd_cli.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/dinglehopper/tests/test_integ_ocrd_cli.py b/src/dinglehopper/tests/test_integ_ocrd_cli.py
index b30d2b0..5bcc189 100644
--- a/src/dinglehopper/tests/test_integ_ocrd_cli.py
+++ b/src/dinglehopper/tests/test_integ_ocrd_cli.py
@@ -34,9 +34,9 @@ def test_ocrd_cli(tmp_path):
             "-O",
             "OCR-D-OCR-CALAMARI-EVAL",
         ]
-        sys.argv[
-            1:
-        ] = args  # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
+        sys.argv[1:] = (
+            args  # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
+        )
         result = runner.invoke(ocrd_dinglehopper, args)
     assert result.exit_code == 0
     result_json = list((test_workspace_dir / "OCR-D-OCR-CALAMARI-EVAL").glob("*.json"))

From 5d9f0c482fb83555626168960bea98c0130759d2 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 27 Mar 2024 17:57:53 +0100
Subject: [PATCH 091/176] =?UTF-8?q?=F0=9F=90=9B=20Check=20that=20we=20alwa?=
 =?UTF-8?q?ys=20get=20a=20valid=20ALTO=20namespace=20(satisfies=20mypy)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocr_files.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py
index f9bd977..0c4fa04 100644
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@@ -18,6 +18,7 @@ def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
     """
     root_name = ET.QName(tree.getroot().tag)
     if root_name.localname == "alto":
+        assert isinstance(root_name.namespace, str)
         return root_name.namespace
     else:
         raise ValueError("Not an ALTO tree")

From a1c1d0ad490280b815d5a4f75196438c1c696349 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 27 Mar 2024 18:31:33 +0100
Subject: [PATCH 092/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Add=20mypy=20?=
 =?UTF-8?q?dependencies?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes gh-106.
---
 .pre-commit-config.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 819d14e..3a83efc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -30,6 +30,9 @@ repos:
         -   types-setuptools
         -   types-lxml
         -   numpy  # for numpy plugin
+        -   attrs
+        -   multimethod
+        -   rapidfuzz
         id: mypy
 
 -   repo: https://gitlab.com/vojko.pribudic/pre-commit-update

From c29a80bc818def251562436ef2a17db819964185 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 27 Mar 2024 18:49:13 +0100
Subject: [PATCH 093/176] =?UTF-8?q?=F0=9F=93=A6=20v0.9.5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocrd-tool.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json
index a71ce37..2cecfd3 100644
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@@ -1,5 +1,5 @@
 {
-  "version": "0.9.4",
+  "version": "0.9.5",
   "git_url": "https://github.com/qurator-spk/dinglehopper",
   "tools": {
     "ocrd-dinglehopper": {

From 945aec5673c6e362f287695f436b9ef99bcfe97e Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 27 Mar 2024 19:01:49 +0100
Subject: [PATCH 094/176] =?UTF-8?q?=E2=9C=92=20README-DEV:=20Releasing=20a?=
 =?UTF-8?q?=20new=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README-DEV.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README-DEV.md b/README-DEV.md
index a3441b1..3ec432f 100644
--- a/README-DEV.md
+++ b/README-DEV.md
@@ -43,3 +43,15 @@ This project optionally uses [pre-commit](https://pre-commit.com) to check commi
 
 - Install pre-commit, e.g. `pip install -r requirements-dev.txt`
 - Install the repo-local git hooks: `pre-commit install`
+
+
+# Releasing a new version
+
+- Update `ocrd-tool.json`
+- `git commit`
+- `git tag vx.y.z`
+- `git push && git push --tags`
+- The GitHub Actions workflow `release` will now create
+  a. a new release on GitHub and
+  b. a new release on PyPI
+- Currently requires a review for PYPI?

From 932bfafc7d29b42ace37a15ab4eb68933ba670bd Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 27 Mar 2024 19:44:09 +0100
Subject: [PATCH 095/176] =?UTF-8?q?=F0=9F=A7=B9=20Make=20process=5Fdir()?=
 =?UTF-8?q?=20keyword=20arguments=20keyword-only?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli.py                      |  8 ++++----
 src/dinglehopper/tests/test_integ_cli_dir.py | 12 ++++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py
index a58a2af..a4b3ce5 100644
--- a/src/dinglehopper/cli.py
+++ b/src/dinglehopper/cli.py
@@ -187,7 +187,7 @@ def process(
 
 
 def process_dir(
-    gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level
+    gt, ocr, report_prefix, reports_folder, *, metrics, differences, textequiv_level
 ):
     for gt_file in os.listdir(gt):
         gt_file_path = os.path.join(gt, gt_file)
@@ -269,9 +269,9 @@ def main(
                 ocr,
                 report_prefix,
                 reports_folder,
-                metrics,
-                differences,
-                textequiv_level,
+                metrics=metrics,
+                differences=differences,
+                textequiv_level=textequiv_level,
             )
     else:
         process(
diff --git a/src/dinglehopper/tests/test_integ_cli_dir.py b/src/dinglehopper/tests/test_integ_cli_dir.py
index c065130..65e59d9 100644
--- a/src/dinglehopper/tests/test_integ_cli_dir.py
+++ b/src/dinglehopper/tests/test_integ_cli_dir.py
@@ -21,9 +21,9 @@ def test_cli_directory(tmp_path):
         os.path.join(data_dir, "directory-test", "ocr"),
         "report",
         str(tmp_path / "reports"),
-        False,
-        True,
-        "line",
+        metrics=False,
+        differences=True,
+        textequiv_level="line",
     )
 
     assert os.path.exists(tmp_path / "reports/1.xml-report.json")
@@ -45,9 +45,9 @@ def test_cli_fail_without_gt(tmp_path):
         os.path.join(data_dir, "directory-test", "ocr"),
         "report",
         str(tmp_path / "reports"),
-        False,
-        True,
-        "line",
+        metrics=False,
+        differences=True,
+        textequiv_level="line",
     )
 
     assert len(os.listdir(tmp_path / "reports")) == 2 * 2

From be7c1dd25dbb7a1b810f337ffc7502f27c906b0d Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 27 Mar 2024 21:09:34 +0100
Subject: [PATCH 096/176] =?UTF-8?q?=F0=9F=A7=B9=20Make=20from=5Ftext=5Fseg?=
 =?UTF-8?q?ment()'s=20textequiv=5Flevel=20keyword-only?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/extracted_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/extracted_text.py b/src/dinglehopper/extracted_text.py
index e4b0915..c7bcba7 100644
--- a/src/dinglehopper/extracted_text.py
+++ b/src/dinglehopper/extracted_text.py
@@ -235,7 +235,7 @@ class ExtractedText:
         return self._segment_id_for_pos[pos]
 
     @classmethod
-    def from_text_segment(cls, text_segment, nsmap, textequiv_level="region"):
+    def from_text_segment(cls, text_segment, nsmap, *, textequiv_level="region"):
         """Build an ExtractedText from a PAGE content text element"""
 
         localname_for_textequiv_level = {"region": "TextRegion", "line": "TextLine"}

From fe1a713d559b11461f0101f729ce87a45e4b51d9 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 4 Apr 2024 19:33:47 +0200
Subject: [PATCH 097/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hook?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3a83efc..25d6201 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,7 +16,7 @@ repos:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.4
+    rev: v0.3.5
     hooks:
     -   args:
         -   --fix

From 32d403753325a595b5bdb7ee2c76bc5aa99ce3cc Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 4 Apr 2024 19:38:27 +0200
Subject: [PATCH 098/176] =?UTF-8?q?=E2=9A=99=20cli:=20Annotate=20types=20i?=
 =?UTF-8?q?n=20process=5Fdir()?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py
index a4b3ce5..78ac33c 100644
--- a/src/dinglehopper/cli.py
+++ b/src/dinglehopper/cli.py
@@ -187,8 +187,15 @@ def process(
 
 
 def process_dir(
-    gt, ocr, report_prefix, reports_folder, *, metrics, differences, textequiv_level
-):
+    gt: str,
+    ocr: str,
+    report_prefix: str,
+    reports_folder: str = ".",
+    *,
+    metrics: bool = True,
+    differences: bool = False,
+    textequiv_level: str = "region",
+) -> None:
     for gt_file in os.listdir(gt):
         gt_file_path = os.path.join(gt, gt_file)
         ocr_file_path = os.path.join(ocr, gt_file)

From edabffec7e1309f58f89694bdbcc067f1f74b5e6 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 4 Apr 2024 19:46:08 +0200
Subject: [PATCH 099/176] =?UTF-8?q?=F0=9F=A7=B9=20tests:=20Move=20comment?=
 =?UTF-8?q?=20out=20of=20the=20code=20(bad=20style=20+=20weird=20formattin?=
 =?UTF-8?q?g)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/tests/test_integ_ocrd_cli.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/dinglehopper/tests/test_integ_ocrd_cli.py b/src/dinglehopper/tests/test_integ_ocrd_cli.py
index 5bcc189..fbda5f4 100644
--- a/src/dinglehopper/tests/test_integ_ocrd_cli.py
+++ b/src/dinglehopper/tests/test_integ_ocrd_cli.py
@@ -34,9 +34,8 @@ def test_ocrd_cli(tmp_path):
             "-O",
             "OCR-D-OCR-CALAMARI-EVAL",
         ]
-        sys.argv[1:] = (
-            args  # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
-        )
+        # Hack to satisfy ocrd_cli_wrap_processor() check for arguments
+        sys.argv[1:] = args
         result = runner.invoke(ocrd_dinglehopper, args)
     assert result.exit_code == 0
     result_json = list((test_workspace_dir / "OCR-D-OCR-CALAMARI-EVAL").glob("*.json"))

From 98d7928f45107b8c16dfb4fbc32a49e68e5cfbaa Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 8 Apr 2024 20:27:47 +0200
Subject: [PATCH 100/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hook?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 25d6201..807d645 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
     -   id: trailing-whitespace
     -   id: end-of-file-fixer

From 2383730a55297be8903d01c1c8e5686a274539ef Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 8 Apr 2024 20:33:03 +0200
Subject: [PATCH 101/176] =?UTF-8?q?=E2=9C=94=20Test=20using=20empty=20file?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Test edge cases + empty files, e.g. empty text content and a Unicode BOM character.

See also gh-79.
---
 .../tests/test_integ_empty_files.py           | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 src/dinglehopper/tests/test_integ_empty_files.py

diff --git a/src/dinglehopper/tests/test_integ_empty_files.py b/src/dinglehopper/tests/test_integ_empty_files.py
new file mode 100644
index 0000000..5c90ed1
--- /dev/null
+++ b/src/dinglehopper/tests/test_integ_empty_files.py
@@ -0,0 +1,35 @@
+from __future__ import division, print_function
+
+import math
+
+import pytest
+
+from .. import character_error_rate, plain_text
+from .util import working_directory
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize(
+    "gt_file_content,ocr_file_content,cer_expected",
+    [
+        ("", "Lorem ipsum", math.inf),
+        ("Lorem ipsum", "", 1.0),
+        ("\ufeff", "Lorem ipsum", math.inf),
+        ("Lorem ipsum", "\ufeff", 1.0),
+        ("", "", 0.0),
+        ("\ufeff", "", 0.0),
+        ("", "\ufeff", 0.0),
+    ],
+)
+def test_empty_files(tmp_path, gt_file_content, ocr_file_content, cer_expected):
+    with working_directory(tmp_path):
+
+        with open("gt.txt", "w") as gtf:
+            gtf.write(gt_file_content)
+        with open("ocr.txt", "w") as ocrf:
+            ocrf.write(ocr_file_content)
+
+        gt_text = plain_text("gt.txt")
+        ocr_text = plain_text("ocr.txt")
+
+        assert character_error_rate(gt_text, ocr_text) == cer_expected

From 79701e410d1c343b7ac03b9de4567cb042723939 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Mon, 29 Apr 2024 08:42:14 +0200
Subject: [PATCH 102/176] Fix some typos (found by `codespell` and `typos`)

Signed-off-by: Stefan Weil <sw@weilnetz.de>
---
 README.md                                    | 4 ++--
 src/dinglehopper/extracted_text.py           | 2 +-
 src/dinglehopper/notebooks/Levenshtein.ipynb | 4 ++--
 src/dinglehopper/word_error_rate.py          | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 035133c..76fcc5a 100644
--- a/README.md
+++ b/README.md
@@ -100,11 +100,11 @@ This generates `summary.html` and `summary.json` in the same `output_folder`.
 
 If you are summarizing many reports and have used the `--differences` flag while
 generating them, it may be useful to limit the number of differences reported by using
-the `--occurences-threshold` parameter. This will reduce the size of the generated HTML
+the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
 report, making it easier to open and navigate. Note that the JSON report will still
 contain all differences. Example:
 ~~~
-dinglehopper-summarize output_folder/ --occurences-threshold 10
+dinglehopper-summarize output_folder/ --occurrences-threshold 10
 ~~~
 
 ### dinglehopper-line-dirs
diff --git a/src/dinglehopper/extracted_text.py b/src/dinglehopper/extracted_text.py
index c7bcba7..6dcf0a7 100644
--- a/src/dinglehopper/extracted_text.py
+++ b/src/dinglehopper/extracted_text.py
@@ -329,7 +329,7 @@ def get_attr(te: Any, attr_name: str) -> float:
     """Extract the attribute for the given name.
 
     Note: currently only handles numeric values!
-    Other or non existend values are encoded as np.nan.
+    Other or non existent values are encoded as np.nan.
     """
     attr_value = te.attrib.get(attr_name)
     try:
diff --git a/src/dinglehopper/notebooks/Levenshtein.ipynb b/src/dinglehopper/notebooks/Levenshtein.ipynb
index 876bee3..b9671d7 100644
--- a/src/dinglehopper/notebooks/Levenshtein.ipynb
+++ b/src/dinglehopper/notebooks/Levenshtein.ipynb
@@ -391,7 +391,7 @@
     "\\text{CER} = \\frac{i + s + d}{n}\n",
     "$$\n",
     "\n",
-    "where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropiate as they *are* clear about this when computing the word error rate.)"
+    "where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropriate as they *are* clear about this when computing the word error rate.)"
    ]
   },
   {
@@ -680,7 +680,7 @@
       "        return cat in unwanted_categories or subcat in unwanted_subcategories\n",
       "\n",
       "    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n",
-      "    # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctation \"or similar characters.\"\n",
+      "    # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctuation \"or similar characters.\"\n",
       "    for word in uniseg.wordbreak.words(s):\n",
       "        if all(unwanted(c) for c in word):\n",
       "            pass\n",
diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py
index b759a69..578850f 100644
--- a/src/dinglehopper/word_error_rate.py
+++ b/src/dinglehopper/word_error_rate.py
@@ -54,7 +54,7 @@ def words(s: str) -> Generator[str, None, None]:
 
     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on
     # word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain
-    # only whitespace, punctation "or similar characters."
+    # only whitespace, punctuation "or similar characters."
     for word in uniseg.wordbreak.words(s):
         if all(unwanted(c) for c in word):
             pass

From 58a688b1757c06f0ea94a0188644779963b837f3 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 6 May 2024 16:57:53 +0200
Subject: [PATCH 103/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hook?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 807d645..640db3b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,12 +11,12 @@ repos:
     -   id: check-ast
 
 -   repo: https://github.com/psf/black
-    rev: 24.3.0
+    rev: 24.4.2
     hooks:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.5
+    rev: v0.4.3
     hooks:
     -   args:
         -   --fix
@@ -24,7 +24,7 @@ repos:
         id: ruff
 
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.9.0
+    rev: v1.10.0
     hooks:
     -   additional_dependencies:
         -   types-setuptools
@@ -35,7 +35,7 @@ repos:
         -   rapidfuzz
         id: mypy
 
--   repo: https://gitlab.com/vojko.pribudic/pre-commit-update
-    rev: v0.2.1
+-   repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
+    rev: v0.3.1post2
     hooks:
     -   id: pre-commit-update

From e34adbf41cac4318564cf10043c02b17df80a32f Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 6 May 2024 16:59:18 +0200
Subject: [PATCH 104/176] =?UTF-8?q?=F0=9F=90=9B=20Fix=20Python=203.12=20su?=
 =?UTF-8?q?pport=20by=20requiring=20ocrd=20>=3D=202.65.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 8f863cc..846d389 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ uniseg >= 0.8.0
 numpy
 colorama
 MarkupSafe
-ocrd >= 2.20.1
+ocrd >= 2.65.0
 attrs
 multimethod >= 1.3
 tqdm

From 0d5c6d5a6230bbd989a92d2ca45b7019d82bf9c7 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 6 May 2024 17:02:52 +0200
Subject: [PATCH 105/176] =?UTF-8?q?=E2=9C=94=20Test=20on=20Python=203.13?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/test.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9b95bff..dddfd91 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,7 +25,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ]
 
     runs-on: "ubuntu-latest"
 
@@ -34,6 +34,7 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
+          allow-prereleases: true
 
       - name: Checkout
         uses: actions/checkout@v3

From fbcb9160fd23f5cefdc4ae28caaf16b1f2b1346d Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 6 May 2024 17:09:05 +0200
Subject: [PATCH 106/176] =?UTF-8?q?=F0=9F=90=9B=20GHA:=20Install=20possibl?=
 =?UTF-8?q?e=20lxml=20build=20requirements=20(if=20building=20from=20sourc?=
 =?UTF-8?q?e)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/test.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index dddfd91..25ca298 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,6 +39,9 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v3
 
+      - name: Install possible lxml build requirements (if building from source)
+        run: sudo apt-get install -y libxml2-dev libxslt-dev python3-dev
+
       - name: Update pip
         run: python3 -m pip install -U pip
       - name: Install requirements*.txt

From 86e723cd53969b63f61c10108f5a8d1deb86a2e3 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 6 May 2024 17:25:59 +0200
Subject: [PATCH 107/176] =?UTF-8?q?=F0=9F=90=9B=20GHA:=20Install=20possibl?=
 =?UTF-8?q?e=20shapely=20build=20requirements=20(if=20building=20from=20so?=
 =?UTF-8?q?urce)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/test.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 25ca298..2b53ba8 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -41,6 +41,8 @@ jobs:
 
       - name: Install possible lxml build requirements (if building from source)
         run: sudo apt-get install -y libxml2-dev libxslt-dev python3-dev
+      - name: Install possible shapely build requirements (if building from source)
+        run: sudo apt-get install -y libgeos-dev
 
       - name: Update pip
         run: python3 -m pip install -U pip

From e72d1e37ea05810cc39b595508fc37655c21ff54 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 6 May 2024 17:42:09 +0200
Subject: [PATCH 108/176] =?UTF-8?q?Revert=20"=E2=9C=94=20Test=20on=20Pytho?=
 =?UTF-8?q?n=203.13"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 0d5c6d5a6230bbd989a92d2ca45b7019d82bf9c7.
---
 .github/workflows/test.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2b53ba8..f049c2c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,7 +25,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
 
     runs-on: "ubuntu-latest"
 
@@ -34,7 +34,6 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
-          allow-prereleases: true
 
       - name: Checkout
         uses: actions/checkout@v3

From 41a0fad352520f83aa764cb5ac28172ebccf1bc0 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 6 May 2024 17:48:48 +0200
Subject: [PATCH 109/176] =?UTF-8?q?=F0=9F=93=A6=20v0.9.6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocrd-tool.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json
index 2cecfd3..27ee989 100644
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@@ -1,5 +1,5 @@
 {
-  "version": "0.9.5",
+  "version": "0.9.6",
   "git_url": "https://github.com/qurator-spk/dinglehopper",
   "tools": {
     "ocrd-dinglehopper": {

From b336f98271036830dcf5d2456ffa8b87752e9c16 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 6 May 2024 18:14:16 +0200
Subject: [PATCH 110/176] =?UTF-8?q?=F0=9F=90=9B=20Fix=20reading=20plain=20?=
 =?UTF-8?q?text=20files?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As reported by @tallemeersch in gh-107, newlines were not removed for plain text files.
Fix this by stripping the lines as suggested.

Fixes gh-107.
---
 src/dinglehopper/ocr_files.py            | 4 ++--
 src/dinglehopper/tests/test_ocr_files.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py
index 0c4fa04..1593f44 100644
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@@ -36,7 +36,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]:
     for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
         line_id = line.attrib.get("ID")
         line_text = " ".join(
-            string.attrib.get("CONTENT")
+            string.attrib.get("CONTENT", "")
             for string in line.iterfind("alto:String", namespaces=nsmap)
         )
         normalized_text = normalize_sbb(line_text)
@@ -167,7 +167,7 @@ def plain_extract(filename, include_filename_in_id=False):
     with open(filename, "r", encoding=fileencoding) as f:
         return ExtractedText(
             None,
-            [make_segment(no, line) for no, line in enumerate(f.readlines())],
+            [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())],
             "\n",
             None,
             None,
diff --git a/src/dinglehopper/tests/test_ocr_files.py b/src/dinglehopper/tests/test_ocr_files.py
index 4790c85..342507a 100644
--- a/src/dinglehopper/tests/test_ocr_files.py
+++ b/src/dinglehopper/tests/test_ocr_files.py
@@ -177,8 +177,8 @@ def test_text():
 def test_plain(tmp_path):
     with working_directory(tmp_path):
         with open("ocr.txt", "w") as ocrf:
-            ocrf.write("AAAAB")
+            ocrf.write("First, a line.\nAnd a second line.\n")
 
         result = plain_text("ocr.txt")
-        expected = "AAAAB"
+        expected = "First, a line.\nAnd a second line."
         assert result == expected

From a534b5e28e4317b150bf43d5bf3ef2d314afff90 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 13 May 2024 21:16:29 +0200
Subject: [PATCH 111/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hook?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 640db3b..2a2cf1e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,7 +16,7 @@ repos:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.3
+    rev: v0.4.4
     hooks:
     -   args:
         -   --fix

From c91234daba29744586c6eab17575b793560d95f4 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 13 May 2024 21:17:42 +0200
Subject: [PATCH 112/176] =?UTF-8?q?=E2=9C=94=20GitHub=20Actions:=20Update?=
 =?UTF-8?q?=20used=20actions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/release.yml | 8 ++++----
 .github/workflows/test.yml    | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 8c193df..3f51bd7 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Upgrade pip
         run: python3 -m pip install --upgrade pip
       - name: Install setuptools
@@ -32,7 +32,7 @@ jobs:
       - name: Build package
         run: python3 -m pip install --upgrade build && python3 -m build
       - name: Upload dist
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: dist
           path: dist/
@@ -42,7 +42,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Download dist
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: dist
           path: dist/
@@ -61,7 +61,7 @@ jobs:
       id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
     steps:
       - name: Download dist
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: dist
           path: dist/
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f049c2c..f40c830 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -31,12 +31,12 @@ jobs:
 
     steps:
       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
 
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Install possible lxml build requirements (if building from source)
         run: sudo apt-get install -y libxml2-dev libxslt-dev python3-dev
@@ -56,7 +56,7 @@ jobs:
             cd src
             python3 -m pytest --junitxml=../${{matrix.python-version}}-junit.xml -o junit_family=legacy
       - name: Upload test results
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         if: success() || failure()
         with:
           name: test-results-${{matrix.python-version}}

From bc5818da9f9d0ae44fcc7580ed458eb8a900be89 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 14 May 2024 15:56:08 +0200
Subject: [PATCH 113/176] =?UTF-8?q?=E2=9C=94=20GitHub=20Actions:=20Update?=
 =?UTF-8?q?=20used=20actions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/test_report.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml
index 908a593..26f411b 100644
--- a/.github/workflows/test_report.yml
+++ b/.github/workflows/test_report.yml
@@ -12,7 +12,7 @@ jobs:
   report:
     runs-on: ubuntu-latest
     steps:
-      - uses: dorny/test-reporter@v1.7.0
+      - uses: dorny/test-reporter@v1
         with:
           artifact: /test-results-(.*)/
           name: 'Tests Results - $1'

From cd68a973cb43ce33790d6f52612a684d933a31e4 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Sun, 26 May 2024 09:18:00 +0200
Subject: [PATCH 114/176] Fix typo

Signed-off-by: Stefan Weil <sw@weilnetz.de>
---
 src/dinglehopper/extracted_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/extracted_text.py b/src/dinglehopper/extracted_text.py
index 6dcf0a7..acfbf78 100644
--- a/src/dinglehopper/extracted_text.py
+++ b/src/dinglehopper/extracted_text.py
@@ -149,7 +149,7 @@ class ExtractedText:
                 raise ValueError("Can't have joiner without segments to join")
         if self.segments is not None:
             if value not in ("", " ", "\n"):
-                raise ValueError(f"Unexcepted segment joiner value {repr(value)}")
+                raise ValueError(f"Unexpected segment joiner value {repr(value)}")
 
     @_text.validator
     def is_valid_text(self, _, value):

From 4047f8b6e537158233f69d2257062b0038d122a0 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 9 Jul 2024 21:01:31 +0200
Subject: [PATCH 115/176] =?UTF-8?q?=F0=9F=90=9B=20Fix=20loading=20ocrd-too?=
 =?UTF-8?q?l.json=20for=20Python=203.12?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements.txt             | 1 +
 src/dinglehopper/ocrd_cli.py | 8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 846d389..6741fa2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ multimethod >= 1.3
 tqdm
 rapidfuzz >= 2.7.0
 chardet
+importlib_resources
diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py
index 8eebdc0..401db6b 100644
--- a/src/dinglehopper/ocrd_cli.py
+++ b/src/dinglehopper/ocrd_cli.py
@@ -1,3 +1,4 @@
+import importlib_resources
 import json
 import os
 
@@ -5,11 +6,14 @@ import click
 from ocrd import Processor
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
 from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
-from pkg_resources import resource_string
 
 from .cli import process as cli_process
 
-OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
+OCRD_TOOL = json.loads(
+    importlib_resources.files(__name__)
+    .joinpath("ocrd-tool.json")
+    .read_text(encoding="utf-8", errors="strict")
+)
 
 
 @click.command()

From d1a224761537fe0239c1486c2f6bc778d70ef76e Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 9 Jul 2024 21:07:59 +0200
Subject: [PATCH 116/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hook?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2a2cf1e..504773b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,7 +16,7 @@ repos:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.4
+    rev: v0.5.1
     hooks:
     -   args:
         -   --fix
@@ -24,7 +24,7 @@ repos:
         id: ruff
 
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.10.0
+    rev: v1.10.1
     hooks:
     -   additional_dependencies:
         -   types-setuptools
@@ -36,6 +36,6 @@ repos:
         id: mypy
 
 -   repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
-    rev: v0.3.1post2
+    rev: v0.3.3post1
     hooks:
     -   id: pre-commit-update

From 2ee37ed4e39b4e973bc807ebba26a97afed578c5 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jul 2024 16:25:38 +0200
Subject: [PATCH 117/176] =?UTF-8?q?=F0=9F=8E=A8=20Sort=20imports?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocrd_cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py
index 401db6b..cfaca39 100644
--- a/src/dinglehopper/ocrd_cli.py
+++ b/src/dinglehopper/ocrd_cli.py
@@ -1,8 +1,8 @@
-import importlib_resources
 import json
 import os
 
 import click
+import importlib_resources
 from ocrd import Processor
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
 from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id

From cf998443c15673fcef976cd00251b38cf7158a0e Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jul 2024 17:15:24 +0200
Subject: [PATCH 118/176] =?UTF-8?q?=E2=9A=99=20ruff:=20Update=20settings?=
 =?UTF-8?q?=20(select=20=E2=86=92=20lint.select)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 41d45ba..25efdcd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,5 +74,5 @@ disallow_untyped_defs = false
 disallow_untyped_calls = false
 
 
-[tool.ruff]
+[tool.ruff.lint]
 select = ["E", "F", "I"]

From 129e6eb427b0d5d306f76c7f443ee7cd08e83495 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 11 Jul 2024 17:25:38 +0200
Subject: [PATCH 119/176] =?UTF-8?q?=F0=9F=93=A6=20v0.9.7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocrd-tool.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json
index 27ee989..f4572c7 100644
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@@ -1,5 +1,5 @@
 {
-  "version": "0.9.6",
+  "version": "0.9.7",
   "git_url": "https://github.com/qurator-spk/dinglehopper",
   "tools": {
     "ocrd-dinglehopper": {

From 2e9e88cc1e3db0390636a5ca79f315b1c0d153e1 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 19 Jul 2024 09:56:40 +0200
Subject: [PATCH 120/176] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hook?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 504773b..b6f88ef 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,7 +16,7 @@ repos:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.1
+    rev: v0.5.3
     hooks:
     -   args:
         -   --fix

From 27ad145c7e303439ef413505b1cb1178bc23370b Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 19 Jul 2024 09:58:01 +0200
Subject: [PATCH 121/176] =?UTF-8?q?=E2=9A=99=20pyproject.toml:=20Add=20lic?=
 =?UTF-8?q?ense.file?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 25efdcd..c2263e0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,6 +9,7 @@ authors = [
 ]
 description = "The OCR evaluation tool"
 readme = "README.md"
+license.file = "LICENSE"
 requires-python = ">=3.8"
 keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
 

From 6d1daf1dfe99575655ec0de0f7155c047e3b9b30 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 19 Jul 2024 14:41:54 +0200
Subject: [PATCH 122/176] =?UTF-8?q?=E2=9C=A8=20Support=20--version=20optio?=
 =?UTF-8?q?n=20in=20CLI?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py
index 78ac33c..b67e9cc 100644
--- a/src/dinglehopper/cli.py
+++ b/src/dinglehopper/cli.py
@@ -234,6 +234,7 @@ def process_dir(
     metavar="LEVEL",
 )
 @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
+@click.version_option()
 def main(
     gt,
     ocr,

From f2e290dffea6179a21d35afea0c1902ca1bbb0fa Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 19 Jul 2024 14:54:46 +0200
Subject: [PATCH 123/176] =?UTF-8?q?=F0=9F=90=9B=20Fix=20--version=20option?=
 =?UTF-8?q?=20in=20OCR-D=20CLI?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocrd_cli.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py
index cfaca39..4da4960 100644
--- a/src/dinglehopper/ocrd_cli.py
+++ b/src/dinglehopper/ocrd_cli.py
@@ -25,6 +25,7 @@ def ocrd_dinglehopper(*args, **kwargs):
 class OcrdDinglehopperEvaluate(Processor):
     def __init__(self, *args, **kwargs):
         kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
+        kwargs["version"] = OCRD_TOOL["version"]
         super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
 
     def process(self):

From 3233dbcc8f036ebe83ae268813006f1476218d7c Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 22 Jul 2024 16:54:33 +0200
Subject: [PATCH 124/176] =?UTF-8?q?=E2=9C=94=20pre-commit:=20Add=20license?=
 =?UTF-8?q?=20check?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 10 ++++++++--
 pyproject.toml          | 31 +++++++++++++++++++++++++++++++
 requirements-dev.txt    |  2 ++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b6f88ef..4baed11 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,7 +16,7 @@ repos:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.3
+    rev: v0.5.4
     hooks:
     -   args:
         -   --fix
@@ -24,7 +24,7 @@ repos:
         id: ruff
 
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.10.1
+    rev: v1.11.0
     hooks:
     -   additional_dependencies:
         -   types-setuptools
@@ -39,3 +39,9 @@ repos:
     rev: v0.3.3post1
     hooks:
     -   id: pre-commit-update
+
+-   repo: https://github.com/dhatim/python-license-check
+    rev: 0.9.2
+    hooks:
+    -   id: liccheck
+        language: system
diff --git a/pyproject.toml b/pyproject.toml
index c2263e0..62bac78 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -77,3 +77,34 @@ disallow_untyped_calls = false
 
 [tool.ruff.lint]
 select = ["E", "F", "I"]
+
+
+[tool.liccheck]
+authorized_licenses = [
+    "bsd",
+    "new bsd",
+    "bsd license",
+    "new bsd license",
+    "simplified bsd",
+    "apache",
+    "apache 2.0",
+    "apache software license",
+    "apache software",
+    "apache license 2.0",
+    "gnu lgpl",
+    "lgpl with exceptions or zpl",
+    "GNU Library or Lesser General Public License (LGPL)",
+    "GNU Lesser General Public License v3 (LGPLv3)",
+    "GNU Lesser General Public License v2 or later (LGPLv2+)",
+    "mit",
+    "mit license",
+    "python software foundation",
+    "Historical Permission Notice and Disclaimer (HPND)",
+    "public domain",
+    'The Unlicense (Unlicense)',
+    "isc",
+    'Mozilla Public License 2.0 (MPL 2.0)',
+]
+unauthorized_licenses = [
+    "gpl v3",
+]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 16ae880..f9f748a 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -10,3 +10,5 @@ mypy
 types-lxml
 types-setuptools
 pytest-mypy
+
+liccheck

From 1753ed4d1363c9d22cdb56494dbeb7eaed78901b Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 6 May 2024 17:02:52 +0200
Subject: [PATCH 125/176] =?UTF-8?q?=E2=9C=94=20Test=20on=20Python=203.13?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/test.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f40c830..387f7a2 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,7 +25,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ]
 
     runs-on: "ubuntu-latest"
 
@@ -34,6 +34,7 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
+          allow-prereleases: true
 
       - name: Checkout
         uses: actions/checkout@v4

From 9c7c104dcec89663a4ddab0e83334db79a639184 Mon Sep 17 00:00:00 2001
From: joschrew <91774427+joschrew@users.noreply.github.com>
Date: Wed, 2 Oct 2024 15:29:36 +0200
Subject: [PATCH 126/176] Add Dockerfile and Makefile to create ocr-d image

---
 Dockerfile     | 22 ++++++++++++++++++++++
 Makefile       | 26 ++++++++++++++++++++++++++
 pyproject.toml |  2 +-
 3 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 Dockerfile
 create mode 100644 Makefile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..a66d718
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,22 @@
+ARG DOCKER_BASE_IMAGE
+FROM $DOCKER_BASE_IMAGE
+ARG VCS_REF
+ARG BUILD_DATE
+LABEL \
+    maintainer="https://ocr-d.de/kontakt" \
+    org.label-schema.vcs-ref=$VCS_REF \
+    org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
+    org.label-schema.build-date=$BUILD_DATE
+
+WORKDIR /build/dinglehopper
+COPY pyproject.toml .
+COPY src/dinglehopper/ocrd-tool.json .
+COPY src ./src
+COPY requirements.txt .
+COPY README.md .
+COPY Makefile .
+RUN make install
+RUN rm -rf /build/dinglehopper
+
+WORKDIR /data
+VOLUME ["/data"]
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..babaf5f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,26 @@
+PYTHON = python3
+PIP = pip3
+PYTHONIOENCODING=utf8
+
+DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0
+DOCKER_TAG = ocrd/dinglehopper
+
+help:
+	@echo
+	@echo "  Targets"
+	@echo
+	@echo "    install Install full Python package via pip"
+	@echo "    docker  Build the ocrd/dinglehopper docker image"
+
+# Install Python package via pip
+install:
+	$(PIP) install .
+
+docker:
+	docker build \
+	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
+	--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
+	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
+	-t $(DOCKER_TAG) .
+
+.PHONY: help install docker
diff --git a/pyproject.toml b/pyproject.toml
index 62bac78..a94e0b9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,7 +49,7 @@ optional-dependencies.dev = {file = ["requirements-dev.txt"]}
 where = ["src"]
 
 [tool.setuptools.package-data]
-dinglehopper = ["templates/*"]
+dinglehopper = ["templates/*", "*.json"]
 
 
 [tool.pytest.ini_options]

From 6ecf49a355eb2f413a38552ca8187ab794b98d3f Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 7 Oct 2024 17:39:42 +0200
Subject: [PATCH 127/176] Update Dockerfile

Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
---
 Dockerfile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index a66d718..a7bda6f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,7 +3,11 @@ FROM $DOCKER_BASE_IMAGE
 ARG VCS_REF
 ARG BUILD_DATE
 LABEL \
-    maintainer="https://ocr-d.de/kontakt" \
+LABEL \
+    maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
+    org.label-schema.vcs-ref=$VCS_REF \
+    org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
+    org.label-schema.build-date=$BUILD_DATE
     org.label-schema.vcs-ref=$VCS_REF \
     org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
     org.label-schema.build-date=$BUILD_DATE

From 6b82293670ea7b642d65e7114a9a4d0c8897a619 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 7 Oct 2024 17:41:59 +0200
Subject: [PATCH 128/176] Update Dockerfile

I fancy-clicked @bertsky's change suggestion, which duplicated some labels. Now fancy-clicking the fix, fingers crossed...
---
 Dockerfile | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index a7bda6f..04e7330 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,15 +2,11 @@ ARG DOCKER_BASE_IMAGE
 FROM $DOCKER_BASE_IMAGE
 ARG VCS_REF
 ARG BUILD_DATE
-LABEL \
 LABEL \
     maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
     org.label-schema.vcs-ref=$VCS_REF \
     org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
     org.label-schema.build-date=$BUILD_DATE
-    org.label-schema.vcs-ref=$VCS_REF \
-    org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
-    org.label-schema.build-date=$BUILD_DATE
 
 WORKDIR /build/dinglehopper
 COPY pyproject.toml .

From 058042accbb7a9425220714c947a5e50193d9220 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 16 Apr 2025 08:59:58 +0200
Subject: [PATCH 129/176] =?UTF-8?q?=E2=9A=99=20=20pre-commit:=20update?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4baed11..403658e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
     hooks:
     -   id: trailing-whitespace
     -   id: end-of-file-fixer
@@ -11,12 +11,12 @@ repos:
     -   id: check-ast
 
 -   repo: https://github.com/psf/black
-    rev: 24.4.2
+    rev: 25.1.0
     hooks:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.4
+    rev: v0.11.5
     hooks:
     -   args:
         -   --fix
@@ -24,7 +24,7 @@ repos:
         id: ruff
 
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.11.0
+    rev: v1.15.0
     hooks:
     -   additional_dependencies:
         -   types-setuptools
@@ -36,7 +36,7 @@ repos:
         id: mypy
 
 -   repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
-    rev: v0.3.3post1
+    rev: v0.6.1
     hooks:
     -   id: pre-commit-update
 

From 7f8a8dd56453ef4df6290615a60011355e247287 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 16 Apr 2025 09:10:43 +0200
Subject: [PATCH 130/176] =?UTF-8?q?=F0=9F=90=9B=20Fix=20for=20changed=20AP?=
 =?UTF-8?q?I=20of=20uniseg's=20word=5Fbreak?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements.txt                    | 2 +-
 src/dinglehopper/word_error_rate.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 6741fa2..0b3d819 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 click
 jinja2
 lxml
-uniseg >= 0.8.0
+uniseg >= 0.9.1
 numpy
 colorama
 MarkupSafe
diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py
index 578850f..ec039b3 100644
--- a/src/dinglehopper/word_error_rate.py
+++ b/src/dinglehopper/word_error_rate.py
@@ -22,11 +22,11 @@ def patch_word_break():
     """
     old_word_break = uniseg.wordbreak.word_break
 
-    def new_word_break(c, index=0):
+    def new_word_break(c):
         if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
-            return uniseg.wordbreak.WordBreak.ALETTER
+            return uniseg.wordbreak.Word_Break.ALetter
         else:
-            return old_word_break(c, index)
+            return old_word_break(c)
 
     uniseg.wordbreak.word_break = new_word_break
     global word_break_patched

From badfa9c99e07fa3faf09c276cd1f8bc3745e5b9a Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 16 Apr 2025 09:25:44 +0200
Subject: [PATCH 131/176] =?UTF-8?q?=E2=9A=99=20=20GitHub=20Actions:=20Don'?=
 =?UTF-8?q?t=20test=20on=20Python=203.8=20anymore?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 387f7a2..277d4ba 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,7 +25,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ]
+        python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
 
     runs-on: "ubuntu-latest"
 

From ce7886af23f2f43691a81002da72060dac902ae4 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 16 Apr 2025 10:57:10 +0200
Subject: [PATCH 132/176] =?UTF-8?q?=E2=9A=99=20=20pyproject.toml:=20Update?=
 =?UTF-8?q?=20supported=20Python=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index a94e0b9..7668e13 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ authors = [
 description = "The OCR evaluation tool"
 readme = "README.md"
 license.file = "LICENSE"
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
 
 dynamic = ["version", "dependencies", "optional-dependencies"]

From d3aa9eb5201833859c15049cf8203085a1dd7fca Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 16 Apr 2025 11:09:33 +0200
Subject: [PATCH 133/176] =?UTF-8?q?=E2=9A=99=20=20liccheck:=20update=20per?=
 =?UTF-8?q?missible=20licenses=20(mit-cmu,=20psf=202.0,=20iscl)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 7668e13..3c02d33 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -98,11 +98,15 @@ authorized_licenses = [
     "GNU Lesser General Public License v2 or later (LGPLv2+)",
     "mit",
     "mit license",
+    "mit-cmu",
     "python software foundation",
+    "psf",
+    "psf-2.0",
     "Historical Permission Notice and Disclaimer (HPND)",
     "public domain",
     'The Unlicense (Unlicense)',
     "isc",
+    "ISC License (ISCL)",
     'Mozilla Public License 2.0 (MPL 2.0)',
 ]
 unauthorized_licenses = [

From 63031b30bff9a7dc0033e8da4f3dd646e3e93949 Mon Sep 17 00:00:00 2001
From: kba <unixprog@gmail.com>
Date: Fri, 11 Apr 2025 17:25:17 +0200
Subject: [PATCH 134/176] Port to OCR-D/core API v3

---
 .dockerignore                   |   5 ++
 Dockerfile                      |  33 +++++++----
 Makefile                        |   9 ++-
 src/dinglehopper/ocrd-tool.json |  10 +---
 src/dinglehopper/ocrd_cli.py    | 100 ++++++++++++++------------------
 5 files changed, 84 insertions(+), 73 deletions(-)
 create mode 100644 .dockerignore

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..a8312db
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,5 @@
+src/dinglehopper/tests
+dist
+build
+*.egg-info
+.git
diff --git a/Dockerfile b/Dockerfile
index 04e7330..d4b2b76 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,17 +6,30 @@ LABEL \
     maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
     org.label-schema.vcs-ref=$VCS_REF \
     org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
-    org.label-schema.build-date=$BUILD_DATE
+    org.label-schema.build-date=$BUILD_DATE \
+    org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
+    org.opencontainers.image.title="dinglehopper" \
+    org.opencontainers.image.description="The OCR evaluation tool" \
+    org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
+    org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
+    org.opencontainers.image.revision=$VCS_REF \
+    org.opencontainers.image.created=$BUILD_DATE \
+    org.opencontainers.image.base.name=ocrd/core
+
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+
+# avoid HOME/.local/share (hard to predict USER here)
+# so let XDG_DATA_HOME coincide with fixed system location
+# (can still be overridden by derived stages)
+ENV XDG_DATA_HOME /usr/local/share
+# avoid the need for an extra volume for persistent resource user db
+# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
+ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
 
 WORKDIR /build/dinglehopper
-COPY pyproject.toml .
-COPY src/dinglehopper/ocrd-tool.json .
-COPY src ./src
-COPY requirements.txt .
-COPY README.md .
-COPY Makefile .
-RUN make install
-RUN rm -rf /build/dinglehopper
+COPY . .
+RUN make install && rm -rf /build/dinglehopper
 
 WORKDIR /data
-VOLUME ["/data"]
+VOLUME /data
diff --git a/Makefile b/Makefile
index babaf5f..2a4b13c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,9 @@
 PYTHON = python3
 PIP = pip3
 PYTHONIOENCODING=utf8
+PYTEST_ARGS = -vv
 
-DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0
+DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
 DOCKER_TAG = ocrd/dinglehopper
 
 help:
@@ -16,6 +17,12 @@ help:
 install:
 	$(PIP) install .
 
+install-dev:
+	$(PIP) install -e .
+
+test:
+	pytest $(PYTEST_ARGS)
+
 docker:
 	docker build \
 	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json
index f4572c7..00d5d2b 100644
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@@ -1,17 +1,13 @@
 {
   "version": "0.9.7",
   "git_url": "https://github.com/qurator-spk/dinglehopper",
+  "dockerhub": "ocrd/dinglehopper",
   "tools": {
     "ocrd-dinglehopper": {
       "executable": "ocrd-dinglehopper",
+      "input_file_grp_cardinality": 2,
+      "output_file_grp_cardinality": 1,
       "description": "Evaluate OCR text against ground truth with dinglehopper",
-      "input_file_grp": [
-        "OCR-D-GT-PAGE",
-        "OCR-D-OCR"
-      ],
-      "output_file_grp": [
-        "OCR-D-OCR-EVAL"
-      ],
       "categories": [
         "Quality assurance"
       ],
diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py
index 4da4960..9696ff9 100644
--- a/src/dinglehopper/ocrd_cli.py
+++ b/src/dinglehopper/ocrd_cli.py
@@ -1,83 +1,73 @@
-import json
+from functools import cached_property
 import os
+from typing import Optional
 
 import click
-import importlib_resources
+from ocrd_models import OcrdFileType
 from ocrd import Processor
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
-from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
+from ocrd_utils import make_file_id
 
 from .cli import process as cli_process
 
-OCRD_TOOL = json.loads(
-    importlib_resources.files(__name__)
-    .joinpath("ocrd-tool.json")
-    .read_text(encoding="utf-8", errors="strict")
-)
-
-
 @click.command()
 @ocrd_cli_options
 def ocrd_dinglehopper(*args, **kwargs):
     return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
 
-
 class OcrdDinglehopperEvaluate(Processor):
-    def __init__(self, *args, **kwargs):
-        kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
-        kwargs["version"] = OCRD_TOOL["version"]
-        super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
 
-    def process(self):
-        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
-        assert_file_grp_cardinality(self.output_file_grp, 1)
+    @cached_property
+    def executable(self):
+        return 'ocrd-dinglehopper'
 
-        log = getLogger("processor.OcrdDinglehopperEvaluate")
+    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
 
+        assert self.parameter
         metrics = self.parameter["metrics"]
         textequiv_level = self.parameter["textequiv_level"]
-        gt_grp, ocr_grp = self.input_file_grp.split(",")
 
-        input_file_tuples = self.zip_input_files(on_error="abort")
-        for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
-            if not gt_file or not ocr_file:
-                # file/page was not found in this group
-                continue
-            gt_file = self.workspace.download_file(gt_file)
-            ocr_file = self.workspace.download_file(ocr_file)
-            page_id = gt_file.pageId
+        try:
+            gt_file, ocr_file = input_files
+            assert gt_file, 'missing GT file'
+            assert ocr_file, 'missing OCR file'
+            assert gt_file.local_filename
+            assert ocr_file.local_filename
+        except (ValueError, AssertionError) as err:
+            self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page?
+            return
 
-            log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
+        page_id = gt_file.pageId
 
-            file_id = make_file_id(ocr_file, self.output_file_grp)
-            report_prefix = os.path.join(self.output_file_grp, file_id)
+        file_id = make_file_id(ocr_file, self.output_file_grp)
+        report_prefix = os.path.join(self.output_file_grp, file_id)
 
-            # Process the files
-            try:
-                os.mkdir(self.output_file_grp)
-            except FileExistsError:
-                pass
-            cli_process(
-                gt_file.local_filename,
-                ocr_file.local_filename,
-                report_prefix,
-                metrics=metrics,
-                textequiv_level=textequiv_level,
+        # Process the files
+        try:
+            os.mkdir(self.output_file_grp)
+        except FileExistsError:
+            pass
+        cli_process(
+            gt_file.local_filename,
+            ocr_file.local_filename,
+            report_prefix,
+            metrics=metrics,
+            textequiv_level=textequiv_level,
+        )
+
+        # Add reports to the workspace
+        for report_suffix, mimetype in [
+            [".html", "text/html"],
+            [".json", "application/json"],
+        ]:
+            self.workspace.add_file(
+                file_id=file_id + report_suffix,
+                file_grp=self.output_file_grp,
+                page_id=page_id,
+                mimetype=mimetype,
+                local_filename=report_prefix + report_suffix,
             )
 
-            # Add reports to the workspace
-            for report_suffix, mimetype in [
-                [".html", "text/html"],
-                [".json", "application/json"],
-            ]:
-                self.workspace.add_file(
-                    file_id=file_id + report_suffix,
-                    file_grp=self.output_file_grp,
-                    page_id=page_id,
-                    mimetype=mimetype,
-                    local_filename=report_prefix + report_suffix,
-                )
-
 
 if __name__ == "__main__":
     ocrd_dinglehopper()

From f287386c0e8b315a077e2400965b56c1e9759cc4 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 16 Apr 2025 14:49:23 +0200
Subject: [PATCH 135/176] =?UTF-8?q?=F0=9F=A7=B9Don't=20pin=20uniseg=20and?=
 =?UTF-8?q?=20rapidfuzz?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Breakage with the newest uniseg API was fixed in master.

Can't see any issue with rapidfuzz, so removing that pin, too.
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 0b3d819..123187b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ uniseg >= 0.9.1
 numpy
 colorama
 MarkupSafe
-ocrd >= 2.65.0
+ocrd >= 3.3.0
 attrs
 multimethod >= 1.3
 tqdm

From 8c1b6d65f57f1fba9c7e71980cb97934460b7073 Mon Sep 17 00:00:00 2001
From: kba <unixprog@gmail.com>
Date: Fri, 11 Apr 2025 17:49:53 +0200
Subject: [PATCH 136/176] Dockerfile: build ocrd-all-tool.json

---
 Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index d4b2b76..75dfcdd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,6 +29,9 @@ ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
 
 WORKDIR /build/dinglehopper
 COPY . .
+COPY ocrd-tool.json .
+# prepackage ocrd-tool.json as ocrd-all-tool.json
+RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
 RUN make install && rm -rf /build/dinglehopper
 
 WORKDIR /data

From c0aa82d18885402ddc0093dfc75a07e0c23a0e5b Mon Sep 17 00:00:00 2001
From: Konstantin Baierer <kba@users.noreply.github.com>
Date: Wed, 16 Apr 2025 14:00:05 +0200
Subject: [PATCH 137/176] OCR-D processor: properly handle missing or
 non-downloaded GT/OCR file

Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
---
 src/dinglehopper/ocrd_cli.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py
index 9696ff9..52da817 100644
--- a/src/dinglehopper/ocrd_cli.py
+++ b/src/dinglehopper/ocrd_cli.py
@@ -27,14 +27,19 @@ class OcrdDinglehopperEvaluate(Processor):
         metrics = self.parameter["metrics"]
         textequiv_level = self.parameter["textequiv_level"]
 
-        try:
-            gt_file, ocr_file = input_files
-            assert gt_file, 'missing GT file'
-            assert ocr_file, 'missing OCR file'
-            assert gt_file.local_filename
-            assert ocr_file.local_filename
-        except (ValueError, AssertionError) as err:
-            self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page?
+        # wrong number of inputs: let fail
+        gt_file, ocr_file = input_files
+        # missing on either side: skip (zip_input_files already warned)
+        if not gt_file or not ocr_file:
+            return
+        # missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
+        if not gt_file.local_filename:
+            if config.OCRD_MISSING_INPUT == 'ABORT':
+                raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype)
+            return
+        if not ocr_file.local_filename:
+            if config.OCRD_MISSING_INPUT == 'ABORT':
+                raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype)
             return
 
         page_id = gt_file.pageId

From 4162836612661a0232ff8783af56c65561df8c48 Mon Sep 17 00:00:00 2001
From: Konstantin Baierer <kba@users.noreply.github.com>
Date: Wed, 16 Apr 2025 18:54:58 +0200
Subject: [PATCH 138/176] ocrd_cli: no need to check fileGrp dir exists

Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
---
 src/dinglehopper/ocrd_cli.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py
index 52da817..90db7d1 100644
--- a/src/dinglehopper/ocrd_cli.py
+++ b/src/dinglehopper/ocrd_cli.py
@@ -45,17 +45,11 @@ class OcrdDinglehopperEvaluate(Processor):
         page_id = gt_file.pageId
 
         file_id = make_file_id(ocr_file, self.output_file_grp)
-        report_prefix = os.path.join(self.output_file_grp, file_id)
-
-        # Process the files
-        try:
-            os.mkdir(self.output_file_grp)
-        except FileExistsError:
-            pass
         cli_process(
             gt_file.local_filename,
             ocr_file.local_filename,
-            report_prefix,
+            file_id,
+            self.output_file_grp,
             metrics=metrics,
             textequiv_level=textequiv_level,
         )

From f6a2c94520dcf79892278320b29e3906d4a5f4bb Mon Sep 17 00:00:00 2001
From: Konstantin Baierer <kba@users.noreply.github.com>
Date: Wed, 16 Apr 2025 18:55:42 +0200
Subject: [PATCH 139/176] ocrd_cli: but do check for existing output files

Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
---
 src/dinglehopper/ocrd_cli.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py
index 90db7d1..dbf59be 100644
--- a/src/dinglehopper/ocrd_cli.py
+++ b/src/dinglehopper/ocrd_cli.py
@@ -59,8 +59,12 @@ class OcrdDinglehopperEvaluate(Processor):
             [".html", "text/html"],
             [".json", "application/json"],
         ]:
+            output_file_id = file_id + report_suffix
+            output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
+            if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
+                raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set")
             self.workspace.add_file(
-                file_id=file_id + report_suffix,
+               file_id=output_file_id,
                 file_grp=self.output_file_grp,
                 page_id=page_id,
                 mimetype=mimetype,

From 831a24fc4ca606cc04bd37a8217a52654e67d3f4 Mon Sep 17 00:00:00 2001
From: kba <unixprog@gmail.com>
Date: Wed, 16 Apr 2025 19:03:13 +0200
Subject: [PATCH 140/176] typo: report_prefix -> file_id

---
 src/dinglehopper/ocrd_cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py
index dbf59be..fa4747f 100644
--- a/src/dinglehopper/ocrd_cli.py
+++ b/src/dinglehopper/ocrd_cli.py
@@ -68,7 +68,7 @@ class OcrdDinglehopperEvaluate(Processor):
                 file_grp=self.output_file_grp,
                 page_id=page_id,
                 mimetype=mimetype,
-                local_filename=report_prefix + report_suffix,
+                local_filename=file_id + report_suffix,
             )
 
 

From b7bdca4ac88a57660814aa83848ff1b2f86fecd6 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 17 Apr 2025 08:09:06 +0200
Subject: [PATCH 141/176] =?UTF-8?q?=F0=9F=90=9B=20Makefile:=20Make=20phony?=
 =?UTF-8?q?=20targets=20.PHONY?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 2a4b13c..12f342a 100644
--- a/Makefile
+++ b/Makefile
@@ -30,4 +30,4 @@ docker:
 	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
 	-t $(DOCKER_TAG) .
 
-.PHONY: help install docker
+.PHONY: help install install-dev test docker

From d974369e13e3bf5f20e24084a27b912430717150 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 17 Apr 2025 08:10:56 +0200
Subject: [PATCH 142/176] =?UTF-8?q?=F0=9F=90=9B=20Docker:=20Fix=20descript?=
 =?UTF-8?q?ion?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 75dfcdd..f942d78 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,7 +9,7 @@ LABEL \
     org.label-schema.build-date=$BUILD_DATE \
     org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
     org.opencontainers.image.title="dinglehopper" \
-    org.opencontainers.image.description="The OCR evaluation tool" \
+    org.opencontainers.image.description="An OCR evaluation tool" \
     org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
     org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
     org.opencontainers.image.revision=$VCS_REF \

From 13ab1ae150481b915c856700c6b0348fb4ba6884 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 17 Apr 2025 08:26:36 +0200
Subject: [PATCH 143/176] =?UTF-8?q?=F0=9F=90=9B=20Docker:=20Use=20same=20v?=
 =?UTF-8?q?endor=20as=20license=20for=20now?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index f942d78..e497d16 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,7 +7,7 @@ LABEL \
     org.label-schema.vcs-ref=$VCS_REF \
     org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
     org.label-schema.build-date=$BUILD_DATE \
-    org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
+    org.opencontainers.image.vendor="qurator" \
     org.opencontainers.image.title="dinglehopper" \
     org.opencontainers.image.description="An OCR evaluation tool" \
     org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \

From ef817cb343a28241ad5acf4ca956551c816450fb Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 17 Apr 2025 08:37:37 +0200
Subject: [PATCH 144/176] =?UTF-8?q?=F0=9F=93=A6=20v0.10.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocrd-tool.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json
index 00d5d2b..f63392a 100644
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@@ -1,5 +1,5 @@
 {
-  "version": "0.9.7",
+  "version": "0.10.0",
   "git_url": "https://github.com/qurator-spk/dinglehopper",
   "dockerhub": "ocrd/dinglehopper",
   "tools": {

From f6dfb77f94b69637d8d6ee8153d5ebfa3d6de90f Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 17 Apr 2025 08:51:32 +0200
Subject: [PATCH 145/176] =?UTF-8?q?=F0=9F=90=9B=20pyproject.toml:=20Fix=20?=
 =?UTF-8?q?description?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3c02d33..9dabb41 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ authors = [
     {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
     {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
 ]
-description = "The OCR evaluation tool"
+description = "An OCR evaluation tool"
 readme = "README.md"
 license.file = "LICENSE"
 requires-python = ">=3.9"

From 64444dd419c7f758ee7ebb42db3746ee016fab7a Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Thu, 17 Apr 2025 16:08:41 +0200
Subject: [PATCH 146/176] opt out of 7f8a8dd5 (uniseg update that requires
 py39)

---
 requirements.txt                    | 2 +-
 src/dinglehopper/word_error_rate.py | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 123187b..653ec59 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 click
 jinja2
 lxml
-uniseg >= 0.9.1
+uniseg >= 0.8.0
 numpy
 colorama
 MarkupSafe
diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py
index ec039b3..f2db504 100644
--- a/src/dinglehopper/word_error_rate.py
+++ b/src/dinglehopper/word_error_rate.py
@@ -21,10 +21,15 @@ def patch_word_break():
     https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
     """
     old_word_break = uniseg.wordbreak.word_break
+    if hasattr(uniseg.wordbreak, 'Word_Break'):
+        aletter = uniseg.wordbreak.Word_Break.ALetter
+    else:
+        # uniseg<0.9
+        aletter = uniseg.wordbreak.WordBreak.ALETTER
 
     def new_word_break(c):
         if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
-            return uniseg.wordbreak.Word_Break.ALetter
+            return aletter
         else:
             return old_word_break(c)
 

From ea33602336f063e68002dbd73e03d617c74dc7e2 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Thu, 17 Apr 2025 16:09:42 +0200
Subject: [PATCH 147/176] CI: reactivate py38

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 277d4ba..387f7a2 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,7 +25,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ]
 
     runs-on: "ubuntu-latest"
 

From a24623b966911040b951d6763e22d7da1d750b90 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Thu, 17 Apr 2025 16:47:13 +0200
Subject: [PATCH 148/176] re-allow py38

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9dabb41..62fae82 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ authors = [
 description = "An OCR evaluation tool"
 readme = "README.md"
 license.file = "LICENSE"
-requires-python = ">=3.9"
+requires-python = ">=3.8"
 keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
 
 dynamic = ["version", "dependencies", "optional-dependencies"]

From 817e0c95f7537ad3c219118b50d39769b2f353a7 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 22 Apr 2025 10:32:29 +0200
Subject: [PATCH 149/176] =?UTF-8?q?=F0=9F=93=A6=20v0.10.1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocrd-tool.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json
index f63392a..43795e1 100644
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@@ -1,5 +1,5 @@
 {
-  "version": "0.10.0",
+  "version": "0.10.1",
   "git_url": "https://github.com/qurator-spk/dinglehopper",
   "dockerhub": "ocrd/dinglehopper",
   "tools": {

From 6bf5bd71780f78f3cef4468628886922c2dbe3c3 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 22 Apr 2025 11:48:44 +0200
Subject: [PATCH 150/176] =?UTF-8?q?=F0=9F=90=9B=20Fix=20vendor=20strings?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Dockerfile | 2 +-
 LICENSE    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index e497d16..c9b5523 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,7 +7,7 @@ LABEL \
     org.label-schema.vcs-ref=$VCS_REF \
     org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
     org.label-schema.build-date=$BUILD_DATE \
-    org.opencontainers.image.vendor="qurator" \
+    org.opencontainers.image.vendor="Staatsbibliothek zu Berlin — SPK" \
     org.opencontainers.image.title="dinglehopper" \
     org.opencontainers.image.description="An OCR evaluation tool" \
     org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
diff --git a/LICENSE b/LICENSE
index 9b7a833..221c706 100644
--- a/LICENSE
+++ b/LICENSE
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright 2019 qurator
+   Copyright 2019-2025 Staatsbibliothek zu Berlin — SPK
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

From d8403421fcf583f3941776659651e2f41663a4ef Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 22 Apr 2025 12:30:47 +0200
Subject: [PATCH 151/176] =?UTF-8?q?=E2=9A=99=20=20pre-commit:=20update?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 403658e..c7e6782 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,7 +16,7 @@ repos:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.5
+    rev: v0.11.6
     hooks:
     -   args:
         -   --fix

From 4024e350f7f5379bfffe81d45ba31bf376a4f4db Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 12:32:07 +0100
Subject: [PATCH 152/176] =?UTF-8?q?=F0=9F=9A=A7=20Test=20new=20flexible=20?=
 =?UTF-8?q?line=20dirs=20functions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/line_dirs_test.py            | 148 ++++++++++++++++++
 .../line_dirs_test/basic/gt/a.gt.txt          |   1 +
 .../line_dirs_test/basic/gt/b.gt.txt          |   1 +
 .../line_dirs_test/basic/ocr/a.some-ocr.txt   |   1 +
 .../line_dirs_test/basic/ocr/b.some-ocr.txt   |   1 +
 .../line_dirs_test/merged/a/a.dummy.jpg       |   0
 .../line_dirs_test/merged/a/a.gt.txt          |   1 +
 .../line_dirs_test/merged/a/a.some-ocr.txt    |   1 +
 .../line_dirs_test/merged/b/b.dummy.jpg       |   0
 .../line_dirs_test/merged/b/b.gt.txt          |   1 +
 .../line_dirs_test/merged/b/b.some-ocr.txt    |   1 +
 .../line_dirs_test/subdirs/gt/a/a.gt.txt      |   1 +
 .../line_dirs_test/subdirs/gt/b/b.gt.txt      |   1 +
 .../subdirs/ocr/a/a.some-ocr.txt              |   1 +
 .../subdirs/ocr/b/b.some-ocr.txt              |   1 +
 15 files changed, 160 insertions(+)
 create mode 100644 src/dinglehopper/line_dirs_test.py
 create mode 100644 src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt
 create mode 100644 src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt
 create mode 100644 src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt
 create mode 100644 src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt
 create mode 100644 src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg
 create mode 100644 src/dinglehopper/line_dirs_test/merged/a/a.gt.txt
 create mode 100644 src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt
 create mode 100644 src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg
 create mode 100644 src/dinglehopper/line_dirs_test/merged/b/b.gt.txt
 create mode 100644 src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt
 create mode 100644 src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt
 create mode 100644 src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt
 create mode 100644 src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt
 create mode 100644 src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt

diff --git a/src/dinglehopper/line_dirs_test.py b/src/dinglehopper/line_dirs_test.py
new file mode 100644
index 0000000..676fe22
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test.py
@@ -0,0 +1,148 @@
+import os.path
+import itertools
+from typing import Iterator, Tuple
+
+def is_hidden(filepath):
+    filename = os.path.basename(os.path.abspath(filepath))
+    return filename.startswith(".")
+
+def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]:
+    """
+    Find all files in dir_, returning filenames
+
+    If pred is given, pred(filename) must be True for the filename.
+
+    Does not return hidden files by default.
+    """
+    for root, _, filenames in os.walk(dir_):
+        for fn in filenames:
+            if not return_hidden and is_hidden(fn):
+                continue
+            if pred and not pred(fn):
+                continue
+            yield os.path.join(root, fn)
+
+
+def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]:
+    """
+    Find GT files and matching OCR files.
+
+    Returns pairs of GT and OCR files.
+    """
+    for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
+        ocr_fn = os.path.join(
+            ocr_dir,
+            os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix)
+            + ocr_suffix,
+        )
+        if not os.path.exists(ocr_fn):
+            raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
+
+        yield gt_fn, ocr_fn
+
+def all_equal(iterable):
+    g = itertools.groupby(iterable)
+    return next(g, True) and not next(g, False)
+
+def common_prefix(its):
+    return [p[0] for p in itertools.takewhile(all_equal, zip(*its))]
+
+
+def common_suffix(its):
+    return reversed(common_prefix(reversed(it) for it in its))
+
+
+def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
+    """
+    Find GT files and matching OCR files, autodetect suffixes.
+
+    This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR)
+    files with a common suffix. Currently the files must have a suffix, e.g.
+    ".gt.txt" (e.g. ".ocr.txt").
+
+    Returns pairs of GT and OCR files.
+    """
+
+    # Autodetect suffixes
+    gt_files = find_all_files(gt_dir)
+    gt_suffix = "".join(common_suffix(gt_files))
+    if len(gt_suffix) == 0:
+        raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix")
+    ocr_files = find_all_files(ocr_dir)
+    ocr_suffix = "".join(common_suffix(ocr_files))
+    if len(ocr_suffix) == 0:
+        raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix")
+
+    yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
+
+
+def test_basic():
+    """Test the dumb method: User gives directories and suffixes."""
+    pairs = list(
+        find_gt_and_ocr_files(
+            "line_dirs_test/basic/gt",
+            ".gt.txt",
+            "line_dirs_test/basic/ocr",
+            ".some-ocr.txt",
+        )
+    )
+
+    assert len(pairs) == 2
+
+def test_basic_autodetect():
+    """Test the autodetect method: User gives directories, suffixes are autodetected if possible"""
+    pairs = list(
+        find_gt_and_ocr_files_autodetect(
+            "line_dirs_test/basic/gt",
+            "line_dirs_test/basic/ocr",
+        )
+    )
+
+    assert len(pairs) == 2
+
+
+def test_subdirs():
+    """Test the dumb method: Should also work when subdirectories are involved."""
+    pairs = list(
+        find_gt_and_ocr_files(
+            "line_dirs_test/subdirs/gt",
+            ".gt.txt",
+            "line_dirs_test/subdirs/ocr",
+            ".some-ocr.txt",
+        )
+    )
+
+    assert len(pairs) == 2
+
+
+def test_subdirs_autodetect():
+    """Test the autodetect method: Should also work when subdirectories are involved."""
+    pairs = list(
+        find_gt_and_ocr_files_autodetect(
+            "line_dirs_test/subdirs/gt",
+            "line_dirs_test/subdirs/ocr",
+        )
+    )
+
+    assert len(pairs) == 2
+
+def test_merged():
+    """Test the dumb method: Should also work when GT and OCR texts are in the same directories."""
+    pairs = list(
+        find_gt_and_ocr_files(
+            "line_dirs_test/merged",
+            ".gt.txt",
+            "line_dirs_test/merged",
+            ".some-ocr.txt",
+        )
+    )
+
+    assert len(pairs) == 2
+
+if __name__ == "__main__":
+    test_basic()
+    test_subdirs()
+    test_merged()
+
+    test_basic_autodetect()
+    test_subdirs_autodetect()
diff --git a/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt b/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt
new file mode 100644
index 0000000..484ba93
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt
@@ -0,0 +1 @@
+This is a test.
diff --git a/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt b/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt
new file mode 100644
index 0000000..fc9bd6a
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt
@@ -0,0 +1 @@
+Another test.
diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt b/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt
new file mode 100644
index 0000000..27cf4bf
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt
@@ -0,0 +1 @@
+Tis is a test.
diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt b/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt
new file mode 100644
index 0000000..0bc0e40
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt
@@ -0,0 +1 @@
+AnÖther test.
diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg b/src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg
new file mode 100644
index 0000000..e69de29
diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt b/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt
new file mode 100644
index 0000000..484ba93
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt
@@ -0,0 +1 @@
+This is a test.
diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt b/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt
new file mode 100644
index 0000000..27cf4bf
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt
@@ -0,0 +1 @@
+Tis is a test.
diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg b/src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg
new file mode 100644
index 0000000..e69de29
diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt b/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt
new file mode 100644
index 0000000..fc9bd6a
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt
@@ -0,0 +1 @@
+Another test.
diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt b/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt
new file mode 100644
index 0000000..0bc0e40
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt
@@ -0,0 +1 @@
+AnÖther test.
diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt b/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt
new file mode 100644
index 0000000..484ba93
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt
@@ -0,0 +1 @@
+This is a test.
diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt b/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt
new file mode 100644
index 0000000..fc9bd6a
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt
@@ -0,0 +1 @@
+Another test.
diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt b/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt
new file mode 100644
index 0000000..27cf4bf
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt
@@ -0,0 +1 @@
+Tis is a test.
diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt b/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt
new file mode 100644
index 0000000..0bc0e40
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt
@@ -0,0 +1 @@
+AnÖther test.

From ad8e6de36bf376a830af29e31cefa43066e5baff Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 12:34:08 +0100
Subject: [PATCH 153/176] =?UTF-8?q?=F0=9F=90=9B=20cli=5Fline=5Fdirs:=20Fix?=
 =?UTF-8?q?=20character=20diff=20reports?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 03bf374..01fd585 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -81,7 +81,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
             joiner="",
             none="·",
             score_hint=score_hint(l_cer, l_n_characters),
-        )
+        )[0]
         word_diff_report += gen_diff_report(
             gt_words,
             ocr_words,
@@ -89,7 +89,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
             joiner=" ",
             none="⋯",
             score_hint=score_hint(l_wer, l_n_words),
-        )
+        )[0]
 
     env = Environment(
         loader=FileSystemLoader(

From 2bf2529c380f028e59953584aa2aa26dc3a828b5 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 12:50:14 +0100
Subject: [PATCH 154/176] =?UTF-8?q?=F0=9F=9A=A7=20Port=20new=20line=20dir?=
 =?UTF-8?q?=20functions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py  | 83 +++++++++++++++++++++++++-----
 src/dinglehopper/line_dirs_test.py | 71 -------------------------
 2 files changed, 69 insertions(+), 85 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 01fd585..43e4f1a 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -1,5 +1,6 @@
 import itertools
 import os
+from typing import Iterator, Tuple
 
 import click
 from jinja2 import Environment, FileSystemLoader
@@ -12,11 +13,36 @@ from .ocr_files import plain_extract
 from .word_error_rate import word_error_rate_n, words_normalized
 
 
+def removesuffix(text, suffix):
+    if suffix and text.endswith(suffix):
+        return text[: -len(suffix)]
+    return text
+
+def is_hidden(filepath):
+    filename = os.path.basename(os.path.abspath(filepath))
+    return filename.startswith(".")
+
+def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]:
+    """
+    Find all files in dir_, returning filenames
+
+    If pred is given, pred(filename) must be True for the filename.
+
+    Does not return hidden files by default.
+    """
+    for root, _, filenames in os.walk(dir_):
+        for fn in filenames:
+            if not return_hidden and is_hidden(fn):
+                continue
+            if pred and not pred(fn):
+                continue
+            yield os.path.join(root, fn)
+
+
 def all_equal(iterable):
     g = itertools.groupby(iterable)
     return next(g, True) and not next(g, False)
 
-
 def common_prefix(its):
     return [p[0] for p in itertools.takewhile(all_equal, zip(*its))]
 
@@ -24,16 +50,49 @@ def common_prefix(its):
 def common_suffix(its):
     return reversed(common_prefix(reversed(it) for it in its))
 
+def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]:
+    """
+    Find GT files and matching OCR files.
 
-def removesuffix(text, suffix):
-    if suffix and text.endswith(suffix):
-        return text[: -len(suffix)]
-    return text
+    Returns pairs of GT and OCR files.
+    """
+    for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
+        ocr_fn = os.path.join(
+            ocr_dir,
+            os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix)
+            + ocr_suffix,
+        )
+        if not os.path.exists(ocr_fn):
+            raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
+
+        yield gt_fn, ocr_fn
+
+
+def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
+    """
+    Find GT files and matching OCR files, autodetect suffixes.
+
+    This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR)
+    files with a common suffix. Currently the files must have a suffix, e.g.
+    ".gt.txt" (e.g. ".ocr.txt").
+
+    Returns pairs of GT and OCR files.
+    """
+
+    # Autodetect suffixes
+    gt_files = find_all_files(gt_dir)
+    gt_suffix = "".join(common_suffix(gt_files))
+    if len(gt_suffix) == 0:
+        raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix")
+    ocr_files = find_all_files(ocr_dir)
+    ocr_suffix = "".join(common_suffix(ocr_files))
+    if len(ocr_suffix) == 0:
+        raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix")
+
+    yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
 
 
 def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
-    gt_suffix = "".join(common_suffix(os.listdir(gt_dir)))
-    ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir)))
 
     cer = None
     n_characters = None
@@ -42,14 +101,10 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
     n_words = None
     word_diff_report = ""
 
-    for k, gt in enumerate(os.listdir(gt_dir)):
-        # Find a match by replacing the suffix
-        ocr = removesuffix(gt, gt_suffix) + ocr_suffix
+    for k, (gt_fn, ocr_fn) in enumerate(find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)):
 
-        gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
-        ocr_text = plain_extract(
-            os.path.join(ocr_dir, ocr), include_filename_in_id=True
-        )
+        gt_text = plain_extract(gt_fn, include_filename_in_id=True)
+        ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
         gt_words = words_normalized(gt_text)
         ocr_words = words_normalized(ocr_text)
 
diff --git a/src/dinglehopper/line_dirs_test.py b/src/dinglehopper/line_dirs_test.py
index 676fe22..9827f01 100644
--- a/src/dinglehopper/line_dirs_test.py
+++ b/src/dinglehopper/line_dirs_test.py
@@ -2,78 +2,7 @@ import os.path
 import itertools
 from typing import Iterator, Tuple
 
-def is_hidden(filepath):
-    filename = os.path.basename(os.path.abspath(filepath))
-    return filename.startswith(".")
 
-def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]:
-    """
-    Find all files in dir_, returning filenames
-
-    If pred is given, pred(filename) must be True for the filename.
-
-    Does not return hidden files by default.
-    """
-    for root, _, filenames in os.walk(dir_):
-        for fn in filenames:
-            if not return_hidden and is_hidden(fn):
-                continue
-            if pred and not pred(fn):
-                continue
-            yield os.path.join(root, fn)
-
-
-def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]:
-    """
-    Find GT files and matching OCR files.
-
-    Returns pairs of GT and OCR files.
-    """
-    for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
-        ocr_fn = os.path.join(
-            ocr_dir,
-            os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix)
-            + ocr_suffix,
-        )
-        if not os.path.exists(ocr_fn):
-            raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
-
-        yield gt_fn, ocr_fn
-
-def all_equal(iterable):
-    g = itertools.groupby(iterable)
-    return next(g, True) and not next(g, False)
-
-def common_prefix(its):
-    return [p[0] for p in itertools.takewhile(all_equal, zip(*its))]
-
-
-def common_suffix(its):
-    return reversed(common_prefix(reversed(it) for it in its))
-
-
-def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
-    """
-    Find GT files and matching OCR files, autodetect suffixes.
-
-    This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR)
-    files with a common suffix. Currently the files must have a suffix, e.g.
-    ".gt.txt" (e.g. ".ocr.txt").
-
-    Returns pairs of GT and OCR files.
-    """
-
-    # Autodetect suffixes
-    gt_files = find_all_files(gt_dir)
-    gt_suffix = "".join(common_suffix(gt_files))
-    if len(gt_suffix) == 0:
-        raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix")
-    ocr_files = find_all_files(ocr_dir)
-    ocr_suffix = "".join(common_suffix(ocr_files))
-    if len(ocr_suffix) == 0:
-        raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix")
-
-    yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
 
 
 def test_basic():

From 6980d7a2526380833ffd4d964e1f1b4c58bfed8a Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 13:21:49 +0100
Subject: [PATCH 155/176] =?UTF-8?q?=F0=9F=9A=A7=20Use=20our=20own=20remove?=
 =?UTF-8?q?suffix()=20as=20we=20still=20support=20Python=203.8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 43e4f1a..30b2be1 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -14,6 +14,11 @@ from .word_error_rate import word_error_rate_n, words_normalized
 
 
 def removesuffix(text, suffix):
+    """
+    Remove suffix from text.
+
+    Can be replaced with str.removesuffix when we only support Python >= 3.9.
+    """
     if suffix and text.endswith(suffix):
         return text[: -len(suffix)]
     return text
@@ -59,7 +64,7 @@ def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tu
     for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
         ocr_fn = os.path.join(
             ocr_dir,
-            os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix)
+            removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix)
             + ocr_suffix,
         )
         if not os.path.exists(ocr_fn):

From 73ee16fe5181c29a06f7460ed1fb1dadd84d6cc2 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 13:59:55 +0100
Subject: [PATCH 156/176] =?UTF-8?q?=F0=9F=9A=A7=20Support=20'merged'=20GT+?=
 =?UTF-8?q?OCR=20line=20directories?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 30b2be1..44305d6 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -97,7 +97,7 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
     yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
 
 
-def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
+def process(gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None):
 
     cer = None
     n_characters = None
@@ -106,8 +106,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
     n_words = None
     word_diff_report = ""
 
-    for k, (gt_fn, ocr_fn) in enumerate(find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)):
+    if gt_suffix is not None and ocr_suffix is not None:
+        gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
+    else:
+        gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
 
+    for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
         gt_text = plain_extract(gt_fn, include_filename_in_id=True)
         ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
         gt_words = words_normalized(gt_text)
@@ -183,17 +187,25 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
 @click.option(
     "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
 )
-def main(gt, ocr, report_prefix, metrics):
+@click.option("--gt-suffix", help="Suffix of GT line text files")
+@click.option("--ocr-suffix", help="Suffix of OCR line text files")
+def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
     """
     Compare the GT line text directory against the OCR line text directory.
 
     This assumes that the GT line text directory contains textfiles with a common
     suffix like ".gt.txt", and the OCR line text directory contains textfiles with
     a common suffix like ".some-ocr.txt". The text files also need to be paired,
-    i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
-    in the OCT lines directory.
+    i.e. the GT filename "line001.gt.txt" needs to match a filename
+    "line001.some-ocr.txt" in the OCR lines directory.
 
-    The GT and OCR directories are usually round truth line texts and the results of
+    GT and OCR directories may contain line text files in matching subdirectories,
+    e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt".
+
+    GT and OCR directories can also be the same directory, but in this case you need
+    to give --gt-suffix and --ocr-suffix explicitly.
+
+    The GT and OCR directories are usually ground truth line texts and the results of
     an OCR software, but you may use dinglehopper to compare two OCR results. In
     that case, use --no-metrics to disable the then meaningless metrics and also
     change the color scheme from green/red to blue.
@@ -204,7 +216,7 @@ def main(gt, ocr, report_prefix, metrics):
 
     """
     initLogging()
-    process(gt, ocr, report_prefix, metrics=metrics)
+    process(gt, ocr, report_prefix, metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix)
 
 
 if __name__ == "__main__":

From 68344e48f870968a92c6c51afb759c1fa47dea2b Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 14:49:40 +0100
Subject: [PATCH 157/176] =?UTF-8?q?=F0=9F=8E=A8=20Reformat=20cli=5Fline=5F?=
 =?UTF-8?q?dirs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 34 +++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 44305d6..9e806a1 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -23,11 +23,13 @@ def removesuffix(text, suffix):
         return text[: -len(suffix)]
     return text
 
+
 def is_hidden(filepath):
     filename = os.path.basename(os.path.abspath(filepath))
     return filename.startswith(".")
 
-def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]:
+
+def find_all_files(dir_: str, pred: Callable[[str], bool]=None, return_hidden: bool=False) -> Iterator[str]:
     """
     Find all files in dir_, returning filenames
 
@@ -48,6 +50,7 @@ def all_equal(iterable):
     g = itertools.groupby(iterable)
     return next(g, True) and not next(g, False)
 
+
 def common_prefix(its):
     return [p[0] for p in itertools.takewhile(all_equal, zip(*its))]
 
@@ -55,7 +58,10 @@ def common_prefix(its):
 def common_suffix(its):
     return reversed(common_prefix(reversed(it) for it in its))
 
-def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]:
+
+def find_gt_and_ocr_files(
+    gt_dir, gt_suffix, ocr_dir, ocr_suffix
+) -> Iterator[Tuple[str, str]]:
     """
     Find GT files and matching OCR files.
 
@@ -64,8 +70,7 @@ def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tu
     for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
         ocr_fn = os.path.join(
             ocr_dir,
-            removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix)
-            + ocr_suffix,
+            removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix,
         )
         if not os.path.exists(ocr_fn):
             raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
@@ -88,16 +93,22 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
     gt_files = find_all_files(gt_dir)
     gt_suffix = "".join(common_suffix(gt_files))
     if len(gt_suffix) == 0:
-        raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix")
+        raise RuntimeError(
+            f"Files in GT directory {gt_dir} do not have a common suffix"
+        )
     ocr_files = find_all_files(ocr_dir)
     ocr_suffix = "".join(common_suffix(ocr_files))
     if len(ocr_suffix) == 0:
-        raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix")
+        raise RuntimeError(
+            f"Files in OCR directory {ocr_dir} do not have a common suffix"
+        )
 
     yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
 
 
-def process(gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None):
+def process(
+    gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None
+):
 
     cer = None
     n_characters = None
@@ -216,7 +227,14 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
 
     """
     initLogging()
-    process(gt, ocr, report_prefix, metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix)
+    process(
+        gt,
+        ocr,
+        report_prefix,
+        metrics=metrics,
+        gt_suffix=gt_suffix,
+        ocr_suffix=ocr_suffix,
+    )
 
 
 if __name__ == "__main__":

From 9414a92f9f31760a694c44f06069f7677e679078 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 15:19:37 +0100
Subject: [PATCH 158/176] =?UTF-8?q?=F0=9F=90=9B=20cli=5Fline=5Fdirs:=20Typ?=
 =?UTF-8?q?e-annotate=20functions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 9e806a1..2cd4fe6 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -1,6 +1,6 @@
 import itertools
 import os
-from typing import Iterator, Tuple
+from typing import Callable, Iterator, Optional, Tuple
 
 import click
 from jinja2 import Environment, FileSystemLoader
@@ -29,7 +29,9 @@ def is_hidden(filepath):
     return filename.startswith(".")
 
 
-def find_all_files(dir_: str, pred: Callable[[str], bool]=None, return_hidden: bool=False) -> Iterator[str]:
+def find_all_files(
+    dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False
+) -> Iterator[str]:
     """
     Find all files in dir_, returning filenames
 
@@ -60,7 +62,7 @@ def common_suffix(its):
 
 
 def find_gt_and_ocr_files(
-    gt_dir, gt_suffix, ocr_dir, ocr_suffix
+    gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str
 ) -> Iterator[Tuple[str, str]]:
     """
     Find GT files and matching OCR files.

From c37316da097d18b74f0da2398b53b64ab712495f Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 19:57:12 +0100
Subject: [PATCH 159/176] =?UTF-8?q?=F0=9F=90=9B=20cli=5Fline=5Fdirs:=20Fix?=
 =?UTF-8?q?=20word=20differences=20section?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

At the time of generation of the section, the {gt,ocr}_words generators
were drained. Fix by using a list.

Fixes gh-124.
---
 src/dinglehopper/cli_line_dirs.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 2cd4fe6..2861d6f 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -1,6 +1,6 @@
 import itertools
 import os
-from typing import Callable, Iterator, Optional, Tuple
+from typing import Callable, Iterator, Optional, Tuple, List
 
 import click
 from jinja2 import Environment, FileSystemLoader
@@ -127,8 +127,8 @@ def process(
     for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
         gt_text = plain_extract(gt_fn, include_filename_in_id=True)
         ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
-        gt_words = words_normalized(gt_text)
-        ocr_words = words_normalized(ocr_text)
+        gt_words: List[str] = list(words_normalized(gt_text))
+        ocr_words: List[str] = list(words_normalized(ocr_text))
 
         # Compute CER
         l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)

From 322faeb26c2c60d8d777ab6132b9af397d0fd510 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Sat, 14 Dec 2024 09:21:09 +0100
Subject: [PATCH 160/176] =?UTF-8?q?=F0=9F=8E=A8=20Sort=20imports?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 2861d6f..5cd1bfa 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -1,6 +1,6 @@
 import itertools
 import os
-from typing import Callable, Iterator, Optional, Tuple, List
+from typing import Callable, Iterator, List, Optional, Tuple
 
 import click
 from jinja2 import Environment, FileSystemLoader

From 3b16c14c16dd00500574b74031107768d5cbb465 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Sat, 14 Dec 2024 09:50:24 +0100
Subject: [PATCH 161/176] =?UTF-8?q?=E2=9C=94=20=20Properly=20test=20line?=
 =?UTF-8?q?=20dir=20finding?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                    |  1 +
 .../data/line_dirs}/basic/gt/a.gt.txt         |  0
 .../data/line_dirs}/basic/gt/b.gt.txt         |  0
 .../data/line_dirs}/basic/ocr/a.some-ocr.txt  |  0
 .../data/line_dirs}/basic/ocr/b.some-ocr.txt  |  0
 .../data/line_dirs}/merged/a/a.dummy.jpg      |  0
 .../data/line_dirs}/merged/a/a.gt.txt         |  0
 .../data/line_dirs}/merged/a/a.some-ocr.txt   |  0
 .../data/line_dirs}/merged/b/b.dummy.jpg      |  0
 .../data/line_dirs}/merged/b/b.gt.txt         |  0
 .../data/line_dirs}/merged/b/b.some-ocr.txt   |  0
 .../data/line_dirs}/subdirs/gt/a/a.gt.txt     |  0
 .../data/line_dirs}/subdirs/gt/b/b.gt.txt     |  0
 .../line_dirs}/subdirs/ocr/a/a.some-ocr.txt   |  0
 .../line_dirs}/subdirs/ocr/b/b.some-ocr.txt   |  0
 .../test_line_dirs.py}                        | 40 ++++++++-----------
 16 files changed, 18 insertions(+), 23 deletions(-)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/gt/a.gt.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/gt/b.gt.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/ocr/a.some-ocr.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/ocr/b.some-ocr.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/a/a.dummy.jpg (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/a/a.gt.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/a/a.some-ocr.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/b/b.dummy.jpg (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/b/b.gt.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/b/b.some-ocr.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/gt/a/a.gt.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/gt/b/b.gt.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/ocr/a/a.some-ocr.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/ocr/b/b.some-ocr.txt (100%)
 rename src/dinglehopper/{line_dirs_test.py => tests/test_line_dirs.py} (52%)

diff --git a/.gitignore b/.gitignore
index d931831..66d66bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ dmypy.json
 
 # User-specific stuff
 .idea
+.*.swp
 
 # Build artifacts
 /build
diff --git a/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt b/src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt
rename to src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt
diff --git a/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt
rename to src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt
diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt
rename to src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt
diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt
rename to src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt
diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg b/src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg
similarity index 100%
rename from src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg
rename to src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg
diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt b/src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/merged/a/a.gt.txt
rename to src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt
diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt
rename to src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt
diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg b/src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg
similarity index 100%
rename from src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg
rename to src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg
diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/merged/b/b.gt.txt
rename to src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt
diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt
rename to src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt
diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt
rename to src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt
diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt
rename to src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt
diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt
rename to src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt
diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt
rename to src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt
diff --git a/src/dinglehopper/line_dirs_test.py b/src/dinglehopper/tests/test_line_dirs.py
similarity index 52%
rename from src/dinglehopper/line_dirs_test.py
rename to src/dinglehopper/tests/test_line_dirs.py
index 9827f01..03966e1 100644
--- a/src/dinglehopper/line_dirs_test.py
+++ b/src/dinglehopper/tests/test_line_dirs.py
@@ -1,29 +1,30 @@
-import os.path
-import itertools
-from typing import Iterator, Tuple
+import os
 
+from ..cli_line_dirs import find_gt_and_ocr_files, find_gt_and_ocr_files_autodetect
 
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
 
 
 def test_basic():
     """Test the dumb method: User gives directories and suffixes."""
     pairs = list(
         find_gt_and_ocr_files(
-            "line_dirs_test/basic/gt",
+            os.path.join(data_dir, "line_dirs/basic/gt"),
             ".gt.txt",
-            "line_dirs_test/basic/ocr",
+            os.path.join(data_dir, "line_dirs/basic/ocr"),
             ".some-ocr.txt",
         )
     )
 
     assert len(pairs) == 2
 
+
 def test_basic_autodetect():
-    """Test the autodetect method: User gives directories, suffixes are autodetected if possible"""
+    """Test autodetect: User gives directories, suffixes are autodetected if possible"""
     pairs = list(
         find_gt_and_ocr_files_autodetect(
-            "line_dirs_test/basic/gt",
-            "line_dirs_test/basic/ocr",
+            os.path.join(data_dir, "line_dirs/basic/gt"),
+            os.path.join(data_dir, "line_dirs/basic/ocr"),
         )
     )
 
@@ -34,9 +35,9 @@ def test_subdirs():
     """Test the dumb method: Should also work when subdirectories are involved."""
     pairs = list(
         find_gt_and_ocr_files(
-            "line_dirs_test/subdirs/gt",
+            os.path.join(data_dir, "line_dirs/subdirs/gt"),
             ".gt.txt",
-            "line_dirs_test/subdirs/ocr",
+            os.path.join(data_dir, "line_dirs/subdirs/ocr"),
             ".some-ocr.txt",
         )
     )
@@ -48,30 +49,23 @@ def test_subdirs_autodetect():
     """Test the autodetect method: Should also work when subdirectories are involved."""
     pairs = list(
         find_gt_and_ocr_files_autodetect(
-            "line_dirs_test/subdirs/gt",
-            "line_dirs_test/subdirs/ocr",
+            os.path.join(data_dir, "line_dirs/subdirs/gt"),
+            os.path.join(data_dir, "line_dirs/subdirs/ocr"),
         )
     )
 
     assert len(pairs) == 2
 
+
 def test_merged():
-    """Test the dumb method: Should also work when GT and OCR texts are in the same directories."""
+    """Test the dumb method: GT and OCR texts are in the same directories."""
     pairs = list(
         find_gt_and_ocr_files(
-            "line_dirs_test/merged",
+            os.path.join(data_dir, "line_dirs/merged"),
             ".gt.txt",
-            "line_dirs_test/merged",
+            os.path.join(data_dir, "line_dirs/merged"),
             ".some-ocr.txt",
         )
     )
 
     assert len(pairs) == 2
-
-if __name__ == "__main__":
-    test_basic()
-    test_subdirs()
-    test_merged()
-
-    test_basic_autodetect()
-    test_subdirs_autodetect()

From f1a586cff1d306d3fbef95c8110af74d3941a894 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Sat, 14 Dec 2024 10:36:58 +0100
Subject: [PATCH 162/176] =?UTF-8?q?=E2=9C=94=20=20Test=20line=20dirs=20CLI?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tests/test_integ_cli_line_dirs.py         | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 src/dinglehopper/tests/test_integ_cli_line_dirs.py

diff --git a/src/dinglehopper/tests/test_integ_cli_line_dirs.py b/src/dinglehopper/tests/test_integ_cli_line_dirs.py
new file mode 100644
index 0000000..90cbabf
--- /dev/null
+++ b/src/dinglehopper/tests/test_integ_cli_line_dirs.py
@@ -0,0 +1,61 @@
+import json
+import os.path
+import re
+
+import pytest
+
+from ..cli_line_dirs import process
+from .util import working_directory
+
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+@pytest.mark.integration
+def test_cli_line_dirs_basic(tmp_path):
+    """Test that the cli/process() produces a good report"""
+
+    with working_directory(tmp_path):
+        gt_dir = os.path.join(data_dir, "line_dirs/basic/gt")
+        ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr")
+        process(gt_dir, ocr_dir, "report")
+        with open("report.json", "r") as jsonf:
+            print(jsonf.read())
+        with open("report.json", "r") as jsonf:
+            j = json.load(jsonf)
+            assert j["cer"] == pytest.approx(0.1071429)
+            assert j["wer"] == pytest.approx(0.5)
+
+
+@pytest.mark.integration
+def test_cli_line_dirs_basic_report_diff(tmp_path):
+    """Test that the cli/process() produces a report with char+word diff"""
+
+    with working_directory(tmp_path):
+        gt_dir = os.path.join(data_dir, "line_dirs/basic/gt")
+        ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr")
+        process(gt_dir, ocr_dir, "report")
+
+        with open("report.html", "r") as htmlf:
+            html_report = htmlf.read()
+
+    # Counting GT lines in the diff
+    assert len(re.findall(r"gt.*l\d+-cdiff", html_report)) == 2
+    assert len(re.findall(r"gt.*l\d+-wdiff", html_report)) == 2
+
+
+@pytest.mark.integration
+def test_cli_line_dirs_merged(tmp_path):
+    """Test that the cli/process() produces a good report"""
+
+    with working_directory(tmp_path):
+        gt_dir = os.path.join(data_dir, "line_dirs/merged")
+        ocr_dir = os.path.join(data_dir, "line_dirs/merged")
+        process(
+            gt_dir, ocr_dir, "report", gt_suffix=".gt.txt", ocr_suffix=".some-ocr.txt"
+        )
+        with open("report.json", "r") as jsonf:
+            print(jsonf.read())
+        with open("report.json", "r") as jsonf:
+            j = json.load(jsonf)
+            assert j["cer"] == pytest.approx(0.1071429)
+            assert j["wer"] == pytest.approx(0.5)

From 480b3cf864ba1ba5c26ed550760b53193b91e93d Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Sat, 14 Dec 2024 11:14:07 +0100
Subject: [PATCH 163/176] =?UTF-8?q?=E2=9C=94=20=20Test=20that=20CLI=20prod?=
 =?UTF-8?q?uces=20a=20complete=20HTML=20report?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...json.py => test_integ_cli_valid_report.py} | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)
 rename src/dinglehopper/tests/{test_integ_cli_valid_json.py => test_integ_cli_valid_report.py} (64%)

diff --git a/src/dinglehopper/tests/test_integ_cli_valid_json.py b/src/dinglehopper/tests/test_integ_cli_valid_report.py
similarity index 64%
rename from src/dinglehopper/tests/test_integ_cli_valid_json.py
rename to src/dinglehopper/tests/test_integ_cli_valid_report.py
index 6cbfa0c..fed0d28 100644
--- a/src/dinglehopper/tests/test_integ_cli_valid_json.py
+++ b/src/dinglehopper/tests/test_integ_cli_valid_report.py
@@ -1,4 +1,5 @@
 import json
+import re
 
 import pytest
 
@@ -40,3 +41,25 @@ def test_cli_json_cer_is_infinity(tmp_path):
         with open("report.json", "r") as jsonf:
             j = json.load(jsonf)
             assert j["cer"] == pytest.approx(float("inf"))
+
+
+@pytest.mark.integration
+def test_cli_html(tmp_path):
+    """Test that the cli/process() yields complete HTML report"""
+
+    with working_directory(tmp_path):
+        with open("gt.txt", "w") as gtf:
+            gtf.write("AAAAA")
+        with open("ocr.txt", "w") as ocrf:
+            ocrf.write("AAAAB")
+
+        process("gt.txt", "ocr.txt", "report")
+
+        with open("report.html", "r") as htmlf:
+            html_report = htmlf.read()
+            print(html_report)
+
+        assert re.search(r"CER: 0\.\d+", html_report)
+        assert re.search(r"WER: 1\.0", html_report)
+        assert len(re.findall("gt.*cdiff", html_report)) == 1
+        assert len(re.findall("gt.*wdiff", html_report)) == 1

From cf59b951a3a30cd23e36a0bb2e553f2d6abcee20 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 4 Feb 2025 13:54:28 +0100
Subject: [PATCH 164/176] =?UTF-8?q?=F0=9F=9A=A7=20Add=20option=20for=20tex?=
 =?UTF-8?q?t=20encoding=20to=20line=20dir=20cli?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 27 +++++++++++++++++++++++----
 src/dinglehopper/ocr_files.py     | 22 ++++++++++++++++------
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 5cd1bfa..4064de0 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -109,7 +109,14 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
 
 
 def process(
-    gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None
+    gt_dir,
+    ocr_dir,
+    report_prefix,
+    *,
+    metrics=True,
+    gt_suffix=None,
+    ocr_suffix=None,
+    plain_encoding="autodetect",
 ):
 
     cer = None
@@ -125,8 +132,12 @@ def process(
         gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
 
     for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
-        gt_text = plain_extract(gt_fn, include_filename_in_id=True)
-        ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
+        gt_text = plain_extract(
+            gt_fn, include_filename_in_id=True, encoding=plain_encoding
+        )
+        ocr_text = plain_extract(
+            ocr_fn, include_filename_in_id=True, encoding=plain_encoding
+        )
         gt_words: List[str] = list(words_normalized(gt_text))
         ocr_words: List[str] = list(words_normalized(ocr_text))
 
@@ -202,7 +213,12 @@ def process(
 )
 @click.option("--gt-suffix", help="Suffix of GT line text files")
 @click.option("--ocr-suffix", help="Suffix of OCR line text files")
-def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding  (e.g. "utf-8") of plain text files',
+)
+def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
     """
     Compare the GT line text directory against the OCR line text directory.
 
@@ -227,6 +243,8 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
     $REPORT_PREFIX defaults to "report". The reports include the character error
     rate (CER) and the word error rate (WER).
 
+    It is recommended to specify the encoding of the text files, for example with
+    --plain-encoding utf-8. If this option is not given, we try to auto-detect it.
     """
     initLogging()
     process(
@@ -236,6 +254,7 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
         metrics=metrics,
         gt_suffix=gt_suffix,
         ocr_suffix=ocr_suffix,
+        plain_encoding=plain_encoding,
     )
 
 
diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py
index 1593f44..1eecebb 100644
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@@ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional
 import chardet
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
+from ocrd_utils import getLogger
 from uniseg.graphemecluster import grapheme_clusters
 
 from .extracted_text import ExtractedText, normalize_sbb
 
+log = getLogger("processor.OcrdDinglehopperEvaluate")
+
 
 def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
     """Return the ALTO namespace used in the given ElementTree.
@@ -149,7 +152,7 @@ def detect_encoding(filename):
     return chardet.detect(open(filename, "rb").read(1024))["encoding"]
 
 
-def plain_extract(filename, include_filename_in_id=False):
+def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"):
     id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
 
     def make_segment(no, line):
@@ -163,7 +166,14 @@ def plain_extract(filename, include_filename_in_id=False):
             clusters,
         )
 
-    fileencoding = detect_encoding(filename)
+    if encoding == "autodetect":
+        fileencoding = detect_encoding(filename)
+        log.warn(
+            f"Autodetected encoding as '{fileencoding}'"
+            ", it is recommended to specify it explicitly with --plain-encoding"
+        )
+    else:
+        fileencoding = encoding
     with open(filename, "r", encoding=fileencoding) as f:
         return ExtractedText(
             None,
@@ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False):
     # XXX hardcoded SBB normalization
 
 
-def plain_text(filename):
-    return plain_extract(filename).text
+def plain_text(filename, encoding="autodetect"):
+    return plain_extract(filename, encoding=encoding).text
 
 
-def extract(filename, *, textequiv_level="region"):
+def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"):
     """Extract the text from the given file.
 
     Supports PAGE, ALTO and falls back to plain text.
@@ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"):
     try:
         tree = ET.parse(filename)
     except (XMLSyntaxError, UnicodeDecodeError):
-        return plain_extract(filename)
+        return plain_extract(filename, encoding=plain_encoding)
     try:
         return page_extract(tree, textequiv_level=textequiv_level)
     except ValueError:

From 5578ce83a3600bbe6f6a0a2679f2b35c90b34fe4 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 13 Feb 2025 16:39:29 +0100
Subject: [PATCH 165/176] =?UTF-8?q?=F0=9F=9A=A7=20Add=20option=20for=20tex?=
 =?UTF-8?q?t=20encoding=20to=20line=20dir=20cli?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py
index b67e9cc..5e5e81c 100644
--- a/src/dinglehopper/cli.py
+++ b/src/dinglehopper/cli.py
@@ -114,6 +114,7 @@ def process(
     metrics: bool = True,
     differences: bool = False,
     textequiv_level: str = "region",
+    plain_encoding: str = "autodetect",
 ) -> None:
     """Check OCR result against GT.
 
@@ -121,8 +122,12 @@ def process(
     this undecorated version and use Click on a wrapper.
     """
 
-    gt_text = extract(gt, textequiv_level=textequiv_level)
-    ocr_text = extract(ocr, textequiv_level=textequiv_level)
+    gt_text = extract(
+        gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    )
+    ocr_text = extract(
+        ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    )
     gt_words: List[str] = list(words_normalized(gt_text))
     ocr_words: List[str] = list(words_normalized(ocr_text))
 
@@ -195,6 +200,7 @@ def process_dir(
     metrics: bool = True,
     differences: bool = False,
     textequiv_level: str = "region",
+    plain_encoding: str = "autodetect",
 ) -> None:
     for gt_file in os.listdir(gt):
         gt_file_path = os.path.join(gt, gt_file)
@@ -209,6 +215,7 @@ def process_dir(
                 metrics=metrics,
                 differences=differences,
                 textequiv_level=textequiv_level,
+                plain_encoding=plain_encoding,
             )
         else:
             print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
@@ -233,6 +240,11 @@ def process_dir(
     help="PAGE TextEquiv level to extract text from",
     metavar="LEVEL",
 )
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding  (e.g. "utf-8") of plain text files',
+)
 @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
 @click.version_option()
 def main(
@@ -243,6 +255,7 @@ def main(
     metrics,
     differences,
     textequiv_level,
+    plain_encoding,
     progress,
 ):
     """
@@ -280,6 +293,7 @@ def main(
                 metrics=metrics,
                 differences=differences,
                 textequiv_level=textequiv_level,
+                plain_encoding=plain_encoding,
             )
     else:
         process(
@@ -290,6 +304,7 @@ def main(
             metrics=metrics,
             differences=differences,
             textequiv_level=textequiv_level,
+            plain_encoding=plain_encoding,
         )
 
 

From 9db5b4caf5b6335066e121a231cee1b1298bfbfa Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 13 Feb 2025 16:48:50 +0100
Subject: [PATCH 166/176] =?UTF-8?q?=F0=9F=9A=A7=20Add=20OCR-D=20parameter?=
 =?UTF-8?q?=20for=20plain=20text=20encoding?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocrd-tool.json | 5 +++++
 src/dinglehopper/ocrd_cli.py    | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json
index 43795e1..ae7c9bb 100644
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@@ -25,6 +25,11 @@
           "enum": ["region", "line"],
           "default": "region",
           "description": "PAGE XML hierarchy level to extract the text from"
+        },
+        "plain_encoding": {
+          "type": "string",
+          "default": "autodetect",
+          "description": "Encoding (e.g. \"utf-8\") of plain text files"
         }
       }
     }
diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py
index fa4747f..2d7da8e 100644
--- a/src/dinglehopper/ocrd_cli.py
+++ b/src/dinglehopper/ocrd_cli.py
@@ -26,6 +26,7 @@ class OcrdDinglehopperEvaluate(Processor):
         assert self.parameter
         metrics = self.parameter["metrics"]
         textequiv_level = self.parameter["textequiv_level"]
+        plain_encoding = self.parameter["plain_encoding"]
 
         # wrong number of inputs: let fail
         gt_file, ocr_file = input_files
@@ -52,6 +53,7 @@ class OcrdDinglehopperEvaluate(Processor):
             self.output_file_grp,
             metrics=metrics,
             textequiv_level=textequiv_level,
+            plain_encoding=plain_encoding,
         )
 
         # Add reports to the workspace

From 224aa02163b5ba28a4f44569b4cbb04d0dae4188 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 13 Feb 2025 16:50:21 +0100
Subject: [PATCH 167/176] =?UTF-8?q?=F0=9F=9A=A7=20Fix=20help=20text?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli.py           | 2 +-
 src/dinglehopper/cli_line_dirs.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py
index 5e5e81c..2d3c075 100644
--- a/src/dinglehopper/cli.py
+++ b/src/dinglehopper/cli.py
@@ -243,7 +243,7 @@ def process_dir(
 @click.option(
     "--plain-encoding",
     default="autodetect",
-    help='Encoding  (e.g. "utf-8") of plain text files',
+    help='Encoding (e.g. "utf-8") of plain text files',
 )
 @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
 @click.version_option()
diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 4064de0..0160f87 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -216,7 +216,7 @@ def process(
 @click.option(
     "--plain-encoding",
     default="autodetect",
-    help='Encoding  (e.g. "utf-8") of plain text files',
+    help='Encoding (e.g. "utf-8") of plain text files',
 )
 def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
     """

From a70260c10edbff774fcae1d3f636b2b5e806d4ae Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 22 Apr 2025 13:56:13 +0200
Subject: [PATCH 168/176] =?UTF-8?q?=F0=9F=90=9B=20Use=20warning()=20to=20f?=
 =?UTF-8?q?ix=20DeprecationWarning?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocr_files.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py
index 1eecebb..fdcaf54 100644
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@@ -168,7 +168,7 @@ def plain_extract(filename, include_filename_in_id=False, encoding="autodetect")
 
     if encoding == "autodetect":
         fileencoding = detect_encoding(filename)
-        log.warn(
+        log.warning(
             f"Autodetected encoding as '{fileencoding}'"
             ", it is recommended to specify it explicitly with --plain-encoding"
         )

From 14a4bc56d85bd953153bf64bcb95a92413814efb Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 22 Apr 2025 18:24:35 +0200
Subject: [PATCH 169/176] =?UTF-8?q?=F0=9F=90=9B=20Add=20--plain-encoding?=
 =?UTF-8?q?=20option=20to=20dinglehopper-extract?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_extract.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/dinglehopper/cli_extract.py b/src/dinglehopper/cli_extract.py
index 9c51d34..5fce032 100644
--- a/src/dinglehopper/cli_extract.py
+++ b/src/dinglehopper/cli_extract.py
@@ -12,7 +12,12 @@ from .ocr_files import extract
     help="PAGE TextEquiv level to extract text from",
     metavar="LEVEL",
 )
-def main(input_file, textequiv_level):
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding (e.g. "utf-8") of plain text files',
+)
+def main(input_file, textequiv_level, plain_encoding):
     """
     Extract the text of the given INPUT_FILE.
 
@@ -23,7 +28,9 @@ def main(input_file, textequiv_level):
     use "--textequiv-level line" to extract from the level of TextLine tags.
     """
     initLogging()
-    input_text = extract(input_file, textequiv_level=textequiv_level).text
+    input_text = extract(
+        input_file, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    ).text
     print(input_text)
 
 

From 9fc8937324b8ba2c94ddd865fb8c05fa5f92c49d Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 24 Apr 2025 15:13:19 +0200
Subject: [PATCH 170/176] =?UTF-8?q?=E2=9C=92=20=20README:=20Mention=20ding?=
 =?UTF-8?q?lehopper-line-dirs=20--help?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 76fcc5a..a40db79 100644
--- a/README.md
+++ b/README.md
@@ -112,9 +112,13 @@ You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.
 with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate
 CLI interface:
 
-~~~
+```
 dinglehopper-line-dirs gt/ ocr/
-~~~
+```
+
+The CLI `dinglehopper-line-dirs` can also work with GT text files in the same
+directories as the OCR text files. You should read `dinglehopper-line-dirs --help`
+in this case.
 
 ### dinglehopper-extract
 The tool `dinglehopper-extract` extracts the text of the given input file on

From 5639f3db7f12647694c4ef03437af00227f45f58 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 24 Apr 2025 16:44:29 +0200
Subject: [PATCH 171/176] =?UTF-8?q?=E2=9C=94=20=20Add=20a=20test=20that?=
 =?UTF-8?q?=20checks=20if=20plain=20text=20files=20with=20BOM=20are=20read?=
 =?UTF-8?q?=20correctly?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/tests/test_ocr_files.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/dinglehopper/tests/test_ocr_files.py b/src/dinglehopper/tests/test_ocr_files.py
index 342507a..0c2a500 100644
--- a/src/dinglehopper/tests/test_ocr_files.py
+++ b/src/dinglehopper/tests/test_ocr_files.py
@@ -182,3 +182,15 @@ def test_plain(tmp_path):
         result = plain_text("ocr.txt")
         expected = "First, a line.\nAnd a second line."
         assert result == expected
+
+
+def test_plain_BOM(tmp_path):
+    """Test that plain text files with BOM are read correctly."""
+    BOM = "\ufeff"
+    with working_directory(tmp_path):
+        with open("ocr.txt", "w") as ocrf:
+            ocrf.write(BOM + "First, a line.\nAnd a second line.\n")
+
+        result = plain_text("ocr.txt")
+        expected = "First, a line.\nAnd a second line."
+        assert result == expected

From 628594ef98df634f3c411c780a4bccd26bb07526 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 24 Apr 2025 17:14:44 +0200
Subject: [PATCH 172/176] =?UTF-8?q?=F0=9F=93=A6=20v0.11.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocrd-tool.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json
index 43795e1..6fad45a 100644
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@@ -1,5 +1,5 @@
 {
-  "version": "0.10.1",
+  "version": "0.11.0",
   "git_url": "https://github.com/qurator-spk/dinglehopper",
   "dockerhub": "ocrd/dinglehopper",
   "tools": {

From 1ebb004386501986562aa0b927f543d9dfa6068c Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 25 Apr 2025 10:13:06 +0200
Subject: [PATCH 173/176] =?UTF-8?q?=E2=9A=99=20=20pre-commit:=20update?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c7e6782..345060d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,7 +16,7 @@ repos:
     -   id: black
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.6
+    rev: v0.11.7
     hooks:
     -   args:
         -   --fix

From 774790c36f7e1477d383bfb0f1771dc523953524 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 25 Apr 2025 11:20:00 +0200
Subject: [PATCH 174/176] =?UTF-8?q?=E2=9C=94=20=20GitHub=20Actions:=20Make?=
 =?UTF-8?q?=20reporting=20results=20clearer?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the "Actions" tab on GitHub, the workflow run that would post test results to the
_original_ workflow run is named "Test Report". This would lead me to click on it to see
the results, just to be disappointed.

This aims to make the naming of the GitHub workflows/jobs clearer.
---
 .github/workflows/test.yml        | 2 +-
 .github/workflows/test_report.yml | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 387f7a2..db089d0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,4 +1,4 @@
-name: Test
+name: 'Test'
 
 on:
 
diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml
index 26f411b..5579d8c 100644
--- a/.github/workflows/test_report.yml
+++ b/.github/workflows/test_report.yml
@@ -1,4 +1,4 @@
-name: 'Test Report'
+name: 'Test - Report results'
 on:
   workflow_run:
     workflows: ['test']
@@ -15,6 +15,6 @@ jobs:
       - uses: dorny/test-reporter@v1
         with:
           artifact: /test-results-(.*)/
-          name: 'Tests Results - $1'
+          name: 'test - Results ($1)'
           path: '*junit.xml'
           reporter: java-junit

From d09e3969f820c425d65e27d5b33baca9b191f9c1 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Fri, 2 May 2025 00:18:38 +0200
Subject: [PATCH 175/176] docker: prepackage ocrd-all-module-dir.json

---
 Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index c9b5523..7064efc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -32,6 +32,8 @@ COPY . .
 COPY ocrd-tool.json .
 # prepackage ocrd-tool.json as ocrd-all-tool.json
 RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
+# prepackage ocrd-all-module-dir.json
+RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json
 RUN make install && rm -rf /build/dinglehopper
 
 WORKDIR /data

From b1ef3af1a8725cd9053941542772b17b66a5cbe5 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Fri, 2 May 2025 00:18:35 +0200
Subject: [PATCH 176/176] docker: use latest core base stage

---
 Makefile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 12f342a..3729311 100644
--- a/Makefile
+++ b/Makefile
@@ -3,8 +3,9 @@ PIP = pip3
 PYTHONIOENCODING=utf8
 PYTEST_ARGS = -vv
 
-DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
-DOCKER_TAG = ocrd/dinglehopper
+DOCKER_BASE_IMAGE ?= docker.io/ocrd/core:latest
+DOCKER_TAG ?= ocrd/dinglehopper
+DOCKER ?= docker
 
 help:
 	@echo
@@ -24,7 +25,7 @@ test:
 	pytest $(PYTEST_ARGS)
 
 docker:
-	docker build \
+	$(DOCKER) build \
 	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
 	--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
 	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \