From 4024e350f7f5379bfffe81d45ba31bf376a4f4db Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 12:32:07 +0100
Subject: [PATCH 01/20] =?UTF-8?q?=F0=9F=9A=A7=20Test=20new=20flexible=20li?=
 =?UTF-8?q?ne=20dirs=20functions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/line_dirs_test.py            | 148 ++++++++++++++++++
 .../line_dirs_test/basic/gt/a.gt.txt          |   1 +
 .../line_dirs_test/basic/gt/b.gt.txt          |   1 +
 .../line_dirs_test/basic/ocr/a.some-ocr.txt   |   1 +
 .../line_dirs_test/basic/ocr/b.some-ocr.txt   |   1 +
 .../line_dirs_test/merged/a/a.dummy.jpg       |   0
 .../line_dirs_test/merged/a/a.gt.txt          |   1 +
 .../line_dirs_test/merged/a/a.some-ocr.txt    |   1 +
 .../line_dirs_test/merged/b/b.dummy.jpg       |   0
 .../line_dirs_test/merged/b/b.gt.txt          |   1 +
 .../line_dirs_test/merged/b/b.some-ocr.txt    |   1 +
 .../line_dirs_test/subdirs/gt/a/a.gt.txt      |   1 +
 .../line_dirs_test/subdirs/gt/b/b.gt.txt      |   1 +
 .../subdirs/ocr/a/a.some-ocr.txt              |   1 +
 .../subdirs/ocr/b/b.some-ocr.txt              |   1 +
 15 files changed, 160 insertions(+)
 create mode 100644 src/dinglehopper/line_dirs_test.py
 create mode 100644 src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt
 create mode 100644 src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt
 create mode 100644 src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt
 create mode 100644 src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt
 create mode 100644 src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg
 create mode 100644 src/dinglehopper/line_dirs_test/merged/a/a.gt.txt
 create mode 100644 src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt
 create mode 100644 src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg
 create mode 100644 src/dinglehopper/line_dirs_test/merged/b/b.gt.txt
 create mode 100644 src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt
 create mode 100644 src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt
 create mode 100644 src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt
 create mode 100644 src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt
 create mode 100644 src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt

diff --git a/src/dinglehopper/line_dirs_test.py b/src/dinglehopper/line_dirs_test.py
new file mode 100644
index 0000000..676fe22
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test.py
@@ -0,0 +1,148 @@
+import os.path
+import itertools
+from typing import Iterator, Tuple
+
+def is_hidden(filepath):
+    filename = os.path.basename(os.path.abspath(filepath))
+    return filename.startswith(".")
+
+def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]:
+    """
+    Find all files in dir_, returning filenames
+
+    If pred is given, pred(filename) must be True for the filename.
+
+    Does not return hidden files by default.
+    """
+    for root, _, filenames in os.walk(dir_):
+        for fn in filenames:
+            if not return_hidden and is_hidden(fn):
+                continue
+            if pred and not pred(fn):
+                continue
+            yield os.path.join(root, fn)
+
+
+def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]:
+    """
+    Find GT files and matching OCR files.
+
+    Returns pairs of GT and OCR files.
+    """
+    for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
+        ocr_fn = os.path.join(
+            ocr_dir,
+            os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix)
+            + ocr_suffix,
+        )
+        if not os.path.exists(ocr_fn):
+            raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
+
+        yield gt_fn, ocr_fn
+
+def all_equal(iterable):
+    g = itertools.groupby(iterable)
+    return next(g, True) and not next(g, False)
+
+def common_prefix(its):
+    return [p[0] for p in itertools.takewhile(all_equal, zip(*its))]
+
+
+def common_suffix(its):
+    return reversed(common_prefix(reversed(it) for it in its))
+
+
+def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
+    """
+    Find GT files and matching OCR files, autodetect suffixes.
+
+    This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR)
+    files with a common suffix. Currently the files must have a suffix, e.g.
+    ".gt.txt" (e.g. ".ocr.txt").
+
+    Returns pairs of GT and OCR files.
+    """
+
+    # Autodetect suffixes
+    gt_files = find_all_files(gt_dir)
+    gt_suffix = "".join(common_suffix(gt_files))
+    if len(gt_suffix) == 0:
+        raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix")
+    ocr_files = find_all_files(ocr_dir)
+    ocr_suffix = "".join(common_suffix(ocr_files))
+    if len(ocr_suffix) == 0:
+        raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix")
+
+    yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
+
+
+def test_basic():
+    """Test the dumb method: User gives directories and suffixes."""
+    pairs = list(
+        find_gt_and_ocr_files(
+            "line_dirs_test/basic/gt",
+            ".gt.txt",
+            "line_dirs_test/basic/ocr",
+            ".some-ocr.txt",
+        )
+    )
+
+    assert len(pairs) == 2
+
+def test_basic_autodetect():
+    """Test the autodetect method: User gives directories, suffixes are autodetected if possible"""
+    pairs = list(
+        find_gt_and_ocr_files_autodetect(
+            "line_dirs_test/basic/gt",
+            "line_dirs_test/basic/ocr",
+        )
+    )
+
+    assert len(pairs) == 2
+
+
+def test_subdirs():
+    """Test the dumb method: Should also work when subdirectories are involved."""
+    pairs = list(
+        find_gt_and_ocr_files(
+            "line_dirs_test/subdirs/gt",
+            ".gt.txt",
+            "line_dirs_test/subdirs/ocr",
+            ".some-ocr.txt",
+        )
+    )
+
+    assert len(pairs) == 2
+
+
+def test_subdirs_autodetect():
+    """Test the autodetect method: Should also work when subdirectories are involved."""
+    pairs = list(
+        find_gt_and_ocr_files_autodetect(
+            "line_dirs_test/subdirs/gt",
+            "line_dirs_test/subdirs/ocr",
+        )
+    )
+
+    assert len(pairs) == 2
+
+def test_merged():
+    """Test the dumb method: Should also work when GT and OCR texts are in the same directories."""
+    pairs = list(
+        find_gt_and_ocr_files(
+            "line_dirs_test/merged",
+            ".gt.txt",
+            "line_dirs_test/merged",
+            ".some-ocr.txt",
+        )
+    )
+
+    assert len(pairs) == 2
+
+if __name__ == "__main__":
+    test_basic()
+    test_subdirs()
+    test_merged()
+
+    test_basic_autodetect()
+    test_subdirs_autodetect()
diff --git a/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt b/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt
new file mode 100644
index 0000000..484ba93
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt
@@ -0,0 +1 @@
+This is a test.
diff --git a/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt b/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt
new file mode 100644
index 0000000..fc9bd6a
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt
@@ -0,0 +1 @@
+Another test.
diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt b/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt
new file mode 100644
index 0000000..27cf4bf
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt
@@ -0,0 +1 @@
+Tis is a test.
diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt b/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt
new file mode 100644
index 0000000..0bc0e40
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt
@@ -0,0 +1 @@
+AnÖther test.
diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg b/src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg
new file mode 100644
index 0000000..e69de29
diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt b/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt
new file mode 100644
index 0000000..484ba93
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt
@@ -0,0 +1 @@
+This is a test.
diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt b/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt
new file mode 100644
index 0000000..27cf4bf
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt
@@ -0,0 +1 @@
+Tis is a test.
diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg b/src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg
new file mode 100644
index 0000000..e69de29
diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt b/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt
new file mode 100644
index 0000000..fc9bd6a
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt
@@ -0,0 +1 @@
+Another test.
diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt b/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt
new file mode 100644
index 0000000..0bc0e40
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt
@@ -0,0 +1 @@
+AnÖther test.
diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt b/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt
new file mode 100644
index 0000000..484ba93
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt
@@ -0,0 +1 @@
+This is a test.
diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt b/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt
new file mode 100644
index 0000000..fc9bd6a
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt
@@ -0,0 +1 @@
+Another test.
diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt b/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt
new file mode 100644
index 0000000..27cf4bf
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt
@@ -0,0 +1 @@
+Tis is a test.
diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt b/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt
new file mode 100644
index 0000000..0bc0e40
--- /dev/null
+++ b/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt
@@ -0,0 +1 @@
+AnÖther test.

From ad8e6de36bf376a830af29e31cefa43066e5baff Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 12:34:08 +0100
Subject: [PATCH 02/20] =?UTF-8?q?=F0=9F=90=9B=20cli=5Fline=5Fdirs:=20Fix?=
 =?UTF-8?q?=20character=20diff=20reports?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 03bf374..01fd585 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -81,7 +81,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
             joiner="",
             none="·",
             score_hint=score_hint(l_cer, l_n_characters),
-        )
+        )[0]
         word_diff_report += gen_diff_report(
             gt_words,
             ocr_words,
@@ -89,7 +89,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
             joiner=" ",
             none="⋯",
             score_hint=score_hint(l_wer, l_n_words),
-        )
+        )[0]
 
     env = Environment(
         loader=FileSystemLoader(

From 2bf2529c380f028e59953584aa2aa26dc3a828b5 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 12:50:14 +0100
Subject: [PATCH 03/20] =?UTF-8?q?=F0=9F=9A=A7=20Port=20new=20line=20dir=20?=
 =?UTF-8?q?functions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py  | 83 +++++++++++++++++++++++++-----
 src/dinglehopper/line_dirs_test.py | 71 -------------------------
 2 files changed, 69 insertions(+), 85 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 01fd585..43e4f1a 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -1,5 +1,6 @@
 import itertools
 import os
+from typing import Iterator, Tuple
 
 import click
 from jinja2 import Environment, FileSystemLoader
@@ -12,11 +13,36 @@ from .ocr_files import plain_extract
 from .word_error_rate import word_error_rate_n, words_normalized
 
 
+def removesuffix(text, suffix):
+    if suffix and text.endswith(suffix):
+        return text[: -len(suffix)]
+    return text
+
+def is_hidden(filepath):
+    filename = os.path.basename(os.path.abspath(filepath))
+    return filename.startswith(".")
+
+def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]:
+    """
+    Find all files in dir_, returning filenames
+
+    If pred is given, pred(filename) must be True for the filename.
+
+    Does not return hidden files by default.
+    """
+    for root, _, filenames in os.walk(dir_):
+        for fn in filenames:
+            if not return_hidden and is_hidden(fn):
+                continue
+            if pred and not pred(fn):
+                continue
+            yield os.path.join(root, fn)
+
+
 def all_equal(iterable):
     g = itertools.groupby(iterable)
     return next(g, True) and not next(g, False)
 
-
 def common_prefix(its):
     return [p[0] for p in itertools.takewhile(all_equal, zip(*its))]
 
@@ -24,16 +50,49 @@ def common_prefix(its):
 def common_suffix(its):
     return reversed(common_prefix(reversed(it) for it in its))
 
+def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]:
+    """
+    Find GT files and matching OCR files.
 
-def removesuffix(text, suffix):
-    if suffix and text.endswith(suffix):
-        return text[: -len(suffix)]
-    return text
+    Returns pairs of GT and OCR files.
+    """
+    for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
+        ocr_fn = os.path.join(
+            ocr_dir,
+            os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix)
+            + ocr_suffix,
+        )
+        if not os.path.exists(ocr_fn):
+            raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
+
+        yield gt_fn, ocr_fn
+
+
+def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
+    """
+    Find GT files and matching OCR files, autodetect suffixes.
+
+    This only works if gt_dir (or respectively ocr_dir) only contains GT (OCR)
+    files with a common suffix. Currently the files must have a suffix, e.g.
+    ".gt.txt" (e.g. ".ocr.txt").
+
+    Returns pairs of GT and OCR files.
+    """
+
+    # Autodetect suffixes
+    gt_files = find_all_files(gt_dir)
+    gt_suffix = "".join(common_suffix(gt_files))
+    if len(gt_suffix) == 0:
+        raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix")
+    ocr_files = find_all_files(ocr_dir)
+    ocr_suffix = "".join(common_suffix(ocr_files))
+    if len(ocr_suffix) == 0:
+        raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix")
+
+    yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
 
 
 def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
-    gt_suffix = "".join(common_suffix(os.listdir(gt_dir)))
-    ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir)))
 
     cer = None
     n_characters = None
@@ -42,14 +101,10 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
     n_words = None
     word_diff_report = ""
 
-    for k, gt in enumerate(os.listdir(gt_dir)):
-        # Find a match by replacing the suffix
-        ocr = removesuffix(gt, gt_suffix) + ocr_suffix
+    for k, (gt_fn, ocr_fn) in enumerate(find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)):
 
-        gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
-        ocr_text = plain_extract(
-            os.path.join(ocr_dir, ocr), include_filename_in_id=True
-        )
+        gt_text = plain_extract(gt_fn, include_filename_in_id=True)
+        ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
         gt_words = words_normalized(gt_text)
         ocr_words = words_normalized(ocr_text)
 
diff --git a/src/dinglehopper/line_dirs_test.py b/src/dinglehopper/line_dirs_test.py
index 676fe22..9827f01 100644
--- a/src/dinglehopper/line_dirs_test.py
+++ b/src/dinglehopper/line_dirs_test.py
@@ -2,78 +2,7 @@ import os.path
 import itertools
 from typing import Iterator, Tuple
 
-def is_hidden(filepath):
-    filename = os.path.basename(os.path.abspath(filepath))
-    return filename.startswith(".")
-
-def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]:
-    """
-    Find all files in dir_, returning filenames
-
-    If pred is given, pred(filename) must be True for the filename.
-
-    Does not return hidden files by default.
-    """
-    for root, _, filenames in os.walk(dir_):
-        for fn in filenames:
-            if not return_hidden and is_hidden(fn):
-                continue
-            if pred and not pred(fn):
-                continue
-            yield os.path.join(root, fn)
-
-
-def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]:
-    """
-    Find GT files and matching OCR files.
-
-    Returns pairs of GT and OCR files.
-    """
-    for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
-        ocr_fn = os.path.join(
-            ocr_dir,
-            os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix)
-            + ocr_suffix,
-        )
-        if not os.path.exists(ocr_fn):
-            raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
-
-        yield gt_fn, ocr_fn
-
-def all_equal(iterable):
-    g = itertools.groupby(iterable)
-    return next(g, True) and not next(g, False)
-
-def common_prefix(its):
-    return [p[0] for p in itertools.takewhile(all_equal, zip(*its))]
-
-
-def common_suffix(its):
-    return reversed(common_prefix(reversed(it) for it in its))
-
-
-def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
-    """
-    Find GT files and matching OCR files, autodetect suffixes.
-
-    This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR)
-    files with a common suffix. Currently the files must have a suffix, e.g.
-    ".gt.txt" (e.g. ".ocr.txt").
-
-    Returns pairs of GT and OCR files.
-    """
-
-    # Autodetect suffixes
-    gt_files = find_all_files(gt_dir)
-    gt_suffix = "".join(common_suffix(gt_files))
-    if len(gt_suffix) == 0:
-        raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix")
-    ocr_files = find_all_files(ocr_dir)
-    ocr_suffix = "".join(common_suffix(ocr_files))
-    if len(ocr_suffix) == 0:
-        raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix")
 
-    yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
 
 
 def test_basic():

From 6980d7a2526380833ffd4d964e1f1b4c58bfed8a Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 13:21:49 +0100
Subject: [PATCH 04/20] =?UTF-8?q?=F0=9F=9A=A7=20Use=20our=20own=20removesu?=
 =?UTF-8?q?ffix()=20as=20we=20still=20support=20Python=203.8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 43e4f1a..30b2be1 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -14,6 +14,11 @@ from .word_error_rate import word_error_rate_n, words_normalized
 
 
 def removesuffix(text, suffix):
+    """
+    Remove suffix from text.
+
+    Can be replaced with str.removesuffix when we only support Python >= 3.9.
+    """
     if suffix and text.endswith(suffix):
         return text[: -len(suffix)]
     return text
@@ -59,7 +64,7 @@ def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tu
     for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
         ocr_fn = os.path.join(
             ocr_dir,
-            os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix)
+            removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix)
             + ocr_suffix,
         )
         if not os.path.exists(ocr_fn):

From 73ee16fe5181c29a06f7460ed1fb1dadd84d6cc2 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 13:59:55 +0100
Subject: [PATCH 05/20] =?UTF-8?q?=F0=9F=9A=A7=20Support=20'merged'=20GT+OC?=
 =?UTF-8?q?R=20line=20directories?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 30b2be1..44305d6 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -97,7 +97,7 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
     yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
 
 
-def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
+def process(gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None):
 
     cer = None
     n_characters = None
@@ -106,8 +106,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
     n_words = None
     word_diff_report = ""
 
-    for k, (gt_fn, ocr_fn) in enumerate(find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)):
+    if gt_suffix is not None and ocr_suffix is not None:
+        gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
+    else:
+        gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
 
+    for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
         gt_text = plain_extract(gt_fn, include_filename_in_id=True)
         ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
         gt_words = words_normalized(gt_text)
@@ -183,17 +187,25 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
 @click.option(
     "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
 )
-def main(gt, ocr, report_prefix, metrics):
+@click.option("--gt-suffix", help="Suffix of GT line text files")
+@click.option("--ocr-suffix", help="Suffix of OCR line text files")
+def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
     """
     Compare the GT line text directory against the OCR line text directory.
 
     This assumes that the GT line text directory contains textfiles with a common
     suffix like ".gt.txt", and the OCR line text directory contains textfiles with
     a common suffix like ".some-ocr.txt". The text files also need to be paired,
-    i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
-    in the OCT lines directory.
+    i.e. the GT filename "line001.gt.txt" needs to match a filename
+    "line001.some-ocr.txt" in the OCR lines directory.
 
-    The GT and OCR directories are usually round truth line texts and the results of
+    GT and OCR directories may contain line text files in matching subdirectories,
+    e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt".
+
+    GT and OCR directories can also be the same directory, but in this case you need
+    to give --gt-suffix and --ocr-suffix explicitly.
+
+    The GT and OCR directories are usually ground truth line texts and the results of
     an OCR software, but you may use dinglehopper to compare two OCR results. In
     that case, use --no-metrics to disable the then meaningless metrics and also
     change the color scheme from green/red to blue.
@@ -204,7 +216,7 @@ def main(gt, ocr, report_prefix, metrics):
 
     """
     initLogging()
-    process(gt, ocr, report_prefix, metrics=metrics)
+    process(gt, ocr, report_prefix, metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix)
 
 
 if __name__ == "__main__":

From 68344e48f870968a92c6c51afb759c1fa47dea2b Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 14:49:40 +0100
Subject: [PATCH 06/20] =?UTF-8?q?=F0=9F=8E=A8=20Reformat=20cli=5Fline=5Fdi?=
 =?UTF-8?q?rs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 34 +++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 44305d6..9e806a1 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -23,11 +23,13 @@ def removesuffix(text, suffix):
         return text[: -len(suffix)]
     return text
 
+
 def is_hidden(filepath):
     filename = os.path.basename(os.path.abspath(filepath))
     return filename.startswith(".")
 
-def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]:
+
+def find_all_files(dir_: str, pred: Callable[[str], bool]=None, return_hidden: bool=False) -> Iterator[str]:
     """
     Find all files in dir_, returning filenames
 
@@ -48,6 +50,7 @@ def all_equal(iterable):
     g = itertools.groupby(iterable)
     return next(g, True) and not next(g, False)
 
+
 def common_prefix(its):
     return [p[0] for p in itertools.takewhile(all_equal, zip(*its))]
 
@@ -55,7 +58,10 @@ def common_prefix(its):
 def common_suffix(its):
     return reversed(common_prefix(reversed(it) for it in its))
 
-def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]:
+
+def find_gt_and_ocr_files(
+    gt_dir, gt_suffix, ocr_dir, ocr_suffix
+) -> Iterator[Tuple[str, str]]:
     """
     Find GT files and matching OCR files.
 
@@ -64,8 +70,7 @@ def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tu
     for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
         ocr_fn = os.path.join(
             ocr_dir,
-            removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix)
-            + ocr_suffix,
+            removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix,
         )
         if not os.path.exists(ocr_fn):
             raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
@@ -88,16 +93,22 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
     gt_files = find_all_files(gt_dir)
     gt_suffix = "".join(common_suffix(gt_files))
     if len(gt_suffix) == 0:
-        raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix")
+        raise RuntimeError(
+            f"Files in GT directory {gt_dir} do not have a common suffix"
+        )
     ocr_files = find_all_files(ocr_dir)
     ocr_suffix = "".join(common_suffix(ocr_files))
     if len(ocr_suffix) == 0:
-        raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix")
+        raise RuntimeError(
+            f"Files in OCR directory {ocr_dir} do not have a common suffix"
+        )
 
     yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
 
 
-def process(gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None):
+def process(
+    gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None
+):
 
     cer = None
     n_characters = None
@@ -216,7 +227,14 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
 
     """
     initLogging()
-    process(gt, ocr, report_prefix, metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix)
+    process(
+        gt,
+        ocr,
+        report_prefix,
+        metrics=metrics,
+        gt_suffix=gt_suffix,
+        ocr_suffix=ocr_suffix,
+    )
 
 
 if __name__ == "__main__":

From 9414a92f9f31760a694c44f06069f7677e679078 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 15:19:37 +0100
Subject: [PATCH 07/20] =?UTF-8?q?=F0=9F=90=9B=20cli=5Fline=5Fdirs:=20Type-?=
 =?UTF-8?q?annotate=20functions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 9e806a1..2cd4fe6 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -1,6 +1,6 @@
 import itertools
 import os
-from typing import Iterator, Tuple
+from typing import Callable, Iterator, Optional, Tuple
 
 import click
 from jinja2 import Environment, FileSystemLoader
@@ -29,7 +29,9 @@ def is_hidden(filepath):
     return filename.startswith(".")
 
 
-def find_all_files(dir_: str, pred: Callable[[str], bool]=None, return_hidden: bool=False) -> Iterator[str]:
+def find_all_files(
+    dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False
+) -> Iterator[str]:
     """
     Find all files in dir_, returning filenames
 
@@ -60,7 +62,7 @@ def common_suffix(its):
 
 
 def find_gt_and_ocr_files(
-    gt_dir, gt_suffix, ocr_dir, ocr_suffix
+    gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str
 ) -> Iterator[Tuple[str, str]]:
     """
     Find GT files and matching OCR files.

From c37316da097d18b74f0da2398b53b64ab712495f Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Dec 2024 19:57:12 +0100
Subject: [PATCH 08/20] =?UTF-8?q?=F0=9F=90=9B=20cli=5Fline=5Fdirs:=20Fix?=
 =?UTF-8?q?=20word=20differences=20section?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

At the time of generation of the section, the {gt,ocr}_words generators
were drained. Fix by using a list.

Fixes gh-124.
---
 src/dinglehopper/cli_line_dirs.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 2cd4fe6..2861d6f 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -1,6 +1,6 @@
 import itertools
 import os
-from typing import Callable, Iterator, Optional, Tuple
+from typing import Callable, Iterator, Optional, Tuple, List
 
 import click
 from jinja2 import Environment, FileSystemLoader
@@ -127,8 +127,8 @@ def process(
     for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
         gt_text = plain_extract(gt_fn, include_filename_in_id=True)
         ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
-        gt_words = words_normalized(gt_text)
-        ocr_words = words_normalized(ocr_text)
+        gt_words: List[str] = list(words_normalized(gt_text))
+        ocr_words: List[str] = list(words_normalized(ocr_text))
 
         # Compute CER
         l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)

From 322faeb26c2c60d8d777ab6132b9af397d0fd510 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Sat, 14 Dec 2024 09:21:09 +0100
Subject: [PATCH 09/20] =?UTF-8?q?=F0=9F=8E=A8=20Sort=20imports?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 2861d6f..5cd1bfa 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -1,6 +1,6 @@
 import itertools
 import os
-from typing import Callable, Iterator, Optional, Tuple, List
+from typing import Callable, Iterator, List, Optional, Tuple
 
 import click
 from jinja2 import Environment, FileSystemLoader

From 3b16c14c16dd00500574b74031107768d5cbb465 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Sat, 14 Dec 2024 09:50:24 +0100
Subject: [PATCH 10/20] =?UTF-8?q?=E2=9C=94=20=20Properly=20test=20line=20d?=
 =?UTF-8?q?ir=20finding?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                    |  1 +
 .../data/line_dirs}/basic/gt/a.gt.txt         |  0
 .../data/line_dirs}/basic/gt/b.gt.txt         |  0
 .../data/line_dirs}/basic/ocr/a.some-ocr.txt  |  0
 .../data/line_dirs}/basic/ocr/b.some-ocr.txt  |  0
 .../data/line_dirs}/merged/a/a.dummy.jpg      |  0
 .../data/line_dirs}/merged/a/a.gt.txt         |  0
 .../data/line_dirs}/merged/a/a.some-ocr.txt   |  0
 .../data/line_dirs}/merged/b/b.dummy.jpg      |  0
 .../data/line_dirs}/merged/b/b.gt.txt         |  0
 .../data/line_dirs}/merged/b/b.some-ocr.txt   |  0
 .../data/line_dirs}/subdirs/gt/a/a.gt.txt     |  0
 .../data/line_dirs}/subdirs/gt/b/b.gt.txt     |  0
 .../line_dirs}/subdirs/ocr/a/a.some-ocr.txt   |  0
 .../line_dirs}/subdirs/ocr/b/b.some-ocr.txt   |  0
 .../test_line_dirs.py}                        | 40 ++++++++-----------
 16 files changed, 18 insertions(+), 23 deletions(-)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/gt/a.gt.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/gt/b.gt.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/ocr/a.some-ocr.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/ocr/b.some-ocr.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/a/a.dummy.jpg (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/a/a.gt.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/a/a.some-ocr.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/b/b.dummy.jpg (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/b/b.gt.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/b/b.some-ocr.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/gt/a/a.gt.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/gt/b/b.gt.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/ocr/a/a.some-ocr.txt (100%)
 rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/ocr/b/b.some-ocr.txt (100%)
 rename src/dinglehopper/{line_dirs_test.py => tests/test_line_dirs.py} (52%)

diff --git a/.gitignore b/.gitignore
index d931831..66d66bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ dmypy.json
 
 # User-specific stuff
 .idea
+.*.swp
 
 # Build artifacts
 /build
diff --git a/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt b/src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt
rename to src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt
diff --git a/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt
rename to src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt
diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt
rename to src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt
diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt
rename to src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt
diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg b/src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg
similarity index 100%
rename from src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg
rename to src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg
diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt b/src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/merged/a/a.gt.txt
rename to src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt
diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt
rename to src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt
diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg b/src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg
similarity index 100%
rename from src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg
rename to src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg
diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/merged/b/b.gt.txt
rename to src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt
diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt
rename to src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt
diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt
rename to src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt
diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt
rename to src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt
diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt
rename to src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt
diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt
similarity index 100%
rename from src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt
rename to src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt
diff --git a/src/dinglehopper/line_dirs_test.py b/src/dinglehopper/tests/test_line_dirs.py
similarity index 52%
rename from src/dinglehopper/line_dirs_test.py
rename to src/dinglehopper/tests/test_line_dirs.py
index 9827f01..03966e1 100644
--- a/src/dinglehopper/line_dirs_test.py
+++ b/src/dinglehopper/tests/test_line_dirs.py
@@ -1,29 +1,30 @@
-import os.path
-import itertools
-from typing import Iterator, Tuple
+import os
 
+from ..cli_line_dirs import find_gt_and_ocr_files, find_gt_and_ocr_files_autodetect
 
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
 
 
 def test_basic():
     """Test the dumb method: User gives directories and suffixes."""
     pairs = list(
         find_gt_and_ocr_files(
-            "line_dirs_test/basic/gt",
+            os.path.join(data_dir, "line_dirs/basic/gt"),
             ".gt.txt",
-            "line_dirs_test/basic/ocr",
+            os.path.join(data_dir, "line_dirs/basic/ocr"),
             ".some-ocr.txt",
         )
     )
 
     assert len(pairs) == 2
 
+
 def test_basic_autodetect():
-    """Test the autodetect method: User gives directories, suffixes are autodetected if possible"""
+    """Test autodetect: User gives directories, suffixes are autodetected if possible"""
     pairs = list(
         find_gt_and_ocr_files_autodetect(
-            "line_dirs_test/basic/gt",
-            "line_dirs_test/basic/ocr",
+            os.path.join(data_dir, "line_dirs/basic/gt"),
+            os.path.join(data_dir, "line_dirs/basic/ocr"),
         )
     )
 
@@ -34,9 +35,9 @@ def test_subdirs():
     """Test the dumb method: Should also work when subdirectories are involved."""
     pairs = list(
         find_gt_and_ocr_files(
-            "line_dirs_test/subdirs/gt",
+            os.path.join(data_dir, "line_dirs/subdirs/gt"),
             ".gt.txt",
-            "line_dirs_test/subdirs/ocr",
+            os.path.join(data_dir, "line_dirs/subdirs/ocr"),
             ".some-ocr.txt",
         )
     )
@@ -48,30 +49,23 @@ def test_subdirs_autodetect():
     """Test the autodetect method: Should also work when subdirectories are involved."""
     pairs = list(
         find_gt_and_ocr_files_autodetect(
-            "line_dirs_test/subdirs/gt",
-            "line_dirs_test/subdirs/ocr",
+            os.path.join(data_dir, "line_dirs/subdirs/gt"),
+            os.path.join(data_dir, "line_dirs/subdirs/ocr"),
         )
     )
 
     assert len(pairs) == 2
 
+
 def test_merged():
-    """Test the dumb method: Should also work when GT and OCR texts are in the same directories."""
+    """Test the dumb method: GT and OCR texts are in the same directories."""
     pairs = list(
         find_gt_and_ocr_files(
-            "line_dirs_test/merged",
+            os.path.join(data_dir, "line_dirs/merged"),
             ".gt.txt",
-            "line_dirs_test/merged",
+            os.path.join(data_dir, "line_dirs/merged"),
             ".some-ocr.txt",
         )
     )
 
     assert len(pairs) == 2
-
-if __name__ == "__main__":
-    test_basic()
-    test_subdirs()
-    test_merged()
-
-    test_basic_autodetect()
-    test_subdirs_autodetect()

From f1a586cff1d306d3fbef95c8110af74d3941a894 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Sat, 14 Dec 2024 10:36:58 +0100
Subject: [PATCH 11/20] =?UTF-8?q?=E2=9C=94=20=20Test=20line=20dirs=20CLI?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tests/test_integ_cli_line_dirs.py         | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 src/dinglehopper/tests/test_integ_cli_line_dirs.py

diff --git a/src/dinglehopper/tests/test_integ_cli_line_dirs.py b/src/dinglehopper/tests/test_integ_cli_line_dirs.py
new file mode 100644
index 0000000..90cbabf
--- /dev/null
+++ b/src/dinglehopper/tests/test_integ_cli_line_dirs.py
@@ -0,0 +1,61 @@
+import json
+import os.path
+import re
+
+import pytest
+
+from ..cli_line_dirs import process
+from .util import working_directory
+
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+@pytest.mark.integration
+def test_cli_line_dirs_basic(tmp_path):
+    """Test that the cli/process() produces a good report"""
+
+    with working_directory(tmp_path):
+        gt_dir = os.path.join(data_dir, "line_dirs/basic/gt")
+        ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr")
+        process(gt_dir, ocr_dir, "report")
+        with open("report.json", "r") as jsonf:
+            print(jsonf.read())
+        with open("report.json", "r") as jsonf:
+            j = json.load(jsonf)
+            assert j["cer"] == pytest.approx(0.1071429)
+            assert j["wer"] == pytest.approx(0.5)
+
+
+@pytest.mark.integration
+def test_cli_line_dirs_basic_report_diff(tmp_path):
+    """Test that the cli/process() produces a report with char+word diff"""
+
+    with working_directory(tmp_path):
+        gt_dir = os.path.join(data_dir, "line_dirs/basic/gt")
+        ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr")
+        process(gt_dir, ocr_dir, "report")
+
+        with open("report.html", "r") as htmlf:
+            html_report = htmlf.read()
+
+    # Counting GT lines in the diff
+    assert len(re.findall(r"gt.*l\d+-cdiff", html_report)) == 2
+    assert len(re.findall(r"gt.*l\d+-wdiff", html_report)) == 2
+
+
+@pytest.mark.integration
+def test_cli_line_dirs_merged(tmp_path):
+    """Test that the cli/process() produces a good report"""
+
+    with working_directory(tmp_path):
+        gt_dir = os.path.join(data_dir, "line_dirs/merged")
+        ocr_dir = os.path.join(data_dir, "line_dirs/merged")
+        process(
+            gt_dir, ocr_dir, "report", gt_suffix=".gt.txt", ocr_suffix=".some-ocr.txt"
+        )
+        with open("report.json", "r") as jsonf:
+            print(jsonf.read())
+        with open("report.json", "r") as jsonf:
+            j = json.load(jsonf)
+            assert j["cer"] == pytest.approx(0.1071429)
+            assert j["wer"] == pytest.approx(0.5)

From 480b3cf864ba1ba5c26ed550760b53193b91e93d Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Sat, 14 Dec 2024 11:14:07 +0100
Subject: [PATCH 12/20] =?UTF-8?q?=E2=9C=94=20=20Test=20that=20CLI=20produc?=
 =?UTF-8?q?es=20a=20complete=20HTML=20report?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...json.py => test_integ_cli_valid_report.py} | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)
 rename src/dinglehopper/tests/{test_integ_cli_valid_json.py => test_integ_cli_valid_report.py} (64%)

diff --git a/src/dinglehopper/tests/test_integ_cli_valid_json.py b/src/dinglehopper/tests/test_integ_cli_valid_report.py
similarity index 64%
rename from src/dinglehopper/tests/test_integ_cli_valid_json.py
rename to src/dinglehopper/tests/test_integ_cli_valid_report.py
index 6cbfa0c..fed0d28 100644
--- a/src/dinglehopper/tests/test_integ_cli_valid_json.py
+++ b/src/dinglehopper/tests/test_integ_cli_valid_report.py
@@ -1,4 +1,5 @@
 import json
+import re
 
 import pytest
 
@@ -40,3 +41,25 @@ def test_cli_json_cer_is_infinity(tmp_path):
         with open("report.json", "r") as jsonf:
             j = json.load(jsonf)
             assert j["cer"] == pytest.approx(float("inf"))
+
+
+@pytest.mark.integration
+def test_cli_html(tmp_path):
+    """Test that the cli/process() yields a complete HTML report"""
+
+    with working_directory(tmp_path):
+        with open("gt.txt", "w") as gtf:
+            gtf.write("AAAAA")
+        with open("ocr.txt", "w") as ocrf:
+            ocrf.write("AAAAB")
+
+        process("gt.txt", "ocr.txt", "report")
+
+        with open("report.html", "r") as htmlf:
+            html_report = htmlf.read()
+            print(html_report)
+
+        assert re.search(r"CER: 0\.\d+", html_report)
+        assert re.search(r"WER: 1\.0", html_report)
+        assert len(re.findall("gt.*cdiff", html_report)) == 1
+        assert len(re.findall("gt.*wdiff", html_report)) == 1

From cf59b951a3a30cd23e36a0bb2e553f2d6abcee20 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 4 Feb 2025 13:54:28 +0100
Subject: [PATCH 13/20] =?UTF-8?q?=F0=9F=9A=A7=20Add=20option=20for=20text?=
 =?UTF-8?q?=20encoding=20to=20line=20dir=20cli?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_line_dirs.py | 27 +++++++++++++++++++++++----
 src/dinglehopper/ocr_files.py     | 22 ++++++++++++++++------
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 5cd1bfa..4064de0 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -109,7 +109,14 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
 
 
 def process(
-    gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None
+    gt_dir,
+    ocr_dir,
+    report_prefix,
+    *,
+    metrics=True,
+    gt_suffix=None,
+    ocr_suffix=None,
+    plain_encoding="autodetect",
 ):
 
     cer = None
@@ -125,8 +132,12 @@ def process(
         gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
 
     for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
-        gt_text = plain_extract(gt_fn, include_filename_in_id=True)
-        ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
+        gt_text = plain_extract(
+            gt_fn, include_filename_in_id=True, encoding=plain_encoding
+        )
+        ocr_text = plain_extract(
+            ocr_fn, include_filename_in_id=True, encoding=plain_encoding
+        )
         gt_words: List[str] = list(words_normalized(gt_text))
         ocr_words: List[str] = list(words_normalized(ocr_text))
 
@@ -202,7 +213,12 @@ def process(
 )
 @click.option("--gt-suffix", help="Suffix of GT line text files")
 @click.option("--ocr-suffix", help="Suffix of OCR line text files")
-def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding  (e.g. "utf-8") of plain text files',
+)
+def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
     """
     Compare the GT line text directory against the OCR line text directory.
 
@@ -227,6 +243,8 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
     $REPORT_PREFIX defaults to "report". The reports include the character error
     rate (CER) and the word error rate (WER).
 
+    It is recommended to specify the encoding of the text files, for example with
+    --plain-encoding utf-8. If this option is not given, we try to auto-detect it.
     """
     initLogging()
     process(
@@ -236,6 +254,7 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
         metrics=metrics,
         gt_suffix=gt_suffix,
         ocr_suffix=ocr_suffix,
+        plain_encoding=plain_encoding,
     )
 
 
diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py
index 1593f44..1eecebb 100644
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@@ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional
 import chardet
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
+from ocrd_utils import getLogger
 from uniseg.graphemecluster import grapheme_clusters
 
 from .extracted_text import ExtractedText, normalize_sbb
 
+log = getLogger("processor.OcrdDinglehopperEvaluate")
+
 
 def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
     """Return the ALTO namespace used in the given ElementTree.
@@ -149,7 +152,7 @@ def detect_encoding(filename):
     return chardet.detect(open(filename, "rb").read(1024))["encoding"]
 
 
-def plain_extract(filename, include_filename_in_id=False):
+def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"):
     id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
 
     def make_segment(no, line):
@@ -163,7 +166,14 @@ def plain_extract(filename, include_filename_in_id=False):
             clusters,
         )
 
-    fileencoding = detect_encoding(filename)
+    if encoding == "autodetect":
+        fileencoding = detect_encoding(filename)
+        log.warn(
+            f"Autodetected encoding as '{fileencoding}'"
+            ", it is recommended to specify it explicitly with --plain-encoding"
+        )
+    else:
+        fileencoding = encoding
     with open(filename, "r", encoding=fileencoding) as f:
         return ExtractedText(
             None,
@@ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False):
     # XXX hardcoded SBB normalization
 
 
-def plain_text(filename):
-    return plain_extract(filename).text
+def plain_text(filename, encoding="autodetect"):
+    return plain_extract(filename, encoding=encoding).text
 
 
-def extract(filename, *, textequiv_level="region"):
+def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"):
     """Extract the text from the given file.
 
     Supports PAGE, ALTO and falls back to plain text.
@@ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"):
     try:
         tree = ET.parse(filename)
     except (XMLSyntaxError, UnicodeDecodeError):
-        return plain_extract(filename)
+        return plain_extract(filename, encoding=plain_encoding)
     try:
         return page_extract(tree, textequiv_level=textequiv_level)
     except ValueError:

From 5578ce83a3600bbe6f6a0a2679f2b35c90b34fe4 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 13 Feb 2025 16:39:29 +0100
Subject: [PATCH 14/20] =?UTF-8?q?=F0=9F=9A=A7=20Add=20option=20for=20text?=
 =?UTF-8?q?=20encoding=20to=20line=20dir=20cli?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py
index b67e9cc..5e5e81c 100644
--- a/src/dinglehopper/cli.py
+++ b/src/dinglehopper/cli.py
@@ -114,6 +114,7 @@ def process(
     metrics: bool = True,
     differences: bool = False,
     textequiv_level: str = "region",
+    plain_encoding: str = "autodetect",
 ) -> None:
     """Check OCR result against GT.
 
@@ -121,8 +122,12 @@ def process(
     this undecorated version and use Click on a wrapper.
     """
 
-    gt_text = extract(gt, textequiv_level=textequiv_level)
-    ocr_text = extract(ocr, textequiv_level=textequiv_level)
+    gt_text = extract(
+        gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    )
+    ocr_text = extract(
+        ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    )
     gt_words: List[str] = list(words_normalized(gt_text))
     ocr_words: List[str] = list(words_normalized(ocr_text))
 
@@ -195,6 +200,7 @@ def process_dir(
     metrics: bool = True,
     differences: bool = False,
     textequiv_level: str = "region",
+    plain_encoding: str = "autodetect",
 ) -> None:
     for gt_file in os.listdir(gt):
         gt_file_path = os.path.join(gt, gt_file)
@@ -209,6 +215,7 @@ def process_dir(
                 metrics=metrics,
                 differences=differences,
                 textequiv_level=textequiv_level,
+                plain_encoding=plain_encoding,
             )
         else:
             print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
@@ -233,6 +240,11 @@ def process_dir(
     help="PAGE TextEquiv level to extract text from",
     metavar="LEVEL",
 )
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding  (e.g. "utf-8") of plain text files',
+)
 @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
 @click.version_option()
 def main(
@@ -243,6 +255,7 @@ def main(
     metrics,
     differences,
     textequiv_level,
+    plain_encoding,
     progress,
 ):
     """
@@ -280,6 +293,7 @@ def main(
                 metrics=metrics,
                 differences=differences,
                 textequiv_level=textequiv_level,
+                plain_encoding=plain_encoding,
             )
     else:
         process(
@@ -290,6 +304,7 @@ def main(
             metrics=metrics,
             differences=differences,
             textequiv_level=textequiv_level,
+            plain_encoding=plain_encoding,
         )
 
 

From 9db5b4caf5b6335066e121a231cee1b1298bfbfa Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 13 Feb 2025 16:48:50 +0100
Subject: [PATCH 15/20] =?UTF-8?q?=F0=9F=9A=A7=20Add=20OCR-D=20parameter=20?=
 =?UTF-8?q?for=20plain=20text=20encoding?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocrd-tool.json | 5 +++++
 src/dinglehopper/ocrd_cli.py    | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json
index 43795e1..ae7c9bb 100644
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@@ -25,6 +25,11 @@
           "enum": ["region", "line"],
           "default": "region",
           "description": "PAGE XML hierarchy level to extract the text from"
+        },
+        "plain_encoding": {
+          "type": "string",
+          "default": "autodetect",
+          "description": "Encoding (e.g. \"utf-8\") of plain text files"
         }
       }
     }
diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py
index fa4747f..2d7da8e 100644
--- a/src/dinglehopper/ocrd_cli.py
+++ b/src/dinglehopper/ocrd_cli.py
@@ -26,6 +26,7 @@ class OcrdDinglehopperEvaluate(Processor):
         assert self.parameter
         metrics = self.parameter["metrics"]
         textequiv_level = self.parameter["textequiv_level"]
+        plain_encoding = self.parameter["plain_encoding"]
 
         # wrong number of inputs: let fail
         gt_file, ocr_file = input_files
@@ -52,6 +53,7 @@ class OcrdDinglehopperEvaluate(Processor):
             self.output_file_grp,
             metrics=metrics,
             textequiv_level=textequiv_level,
+            plain_encoding=plain_encoding,
         )
 
         # Add reports to the workspace

From 224aa02163b5ba28a4f44569b4cbb04d0dae4188 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 13 Feb 2025 16:50:21 +0100
Subject: [PATCH 16/20] =?UTF-8?q?=F0=9F=9A=A7=20Fix=20help=20text?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli.py           | 2 +-
 src/dinglehopper/cli_line_dirs.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py
index 5e5e81c..2d3c075 100644
--- a/src/dinglehopper/cli.py
+++ b/src/dinglehopper/cli.py
@@ -243,7 +243,7 @@ def process_dir(
 @click.option(
     "--plain-encoding",
     default="autodetect",
-    help='Encoding  (e.g. "utf-8") of plain text files',
+    help='Encoding (e.g. "utf-8") of plain text files',
 )
 @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
 @click.version_option()
diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 4064de0..0160f87 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -216,7 +216,7 @@ def process(
 @click.option(
     "--plain-encoding",
     default="autodetect",
-    help='Encoding  (e.g. "utf-8") of plain text files',
+    help='Encoding (e.g. "utf-8") of plain text files',
 )
 def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
     """

From a70260c10edbff774fcae1d3f636b2b5e806d4ae Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 22 Apr 2025 13:56:13 +0200
Subject: [PATCH 17/20] =?UTF-8?q?=F0=9F=90=9B=20Use=20warning()=20to=20fix?=
 =?UTF-8?q?=20DeprecationWarning?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/ocr_files.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py
index 1eecebb..fdcaf54 100644
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@@ -168,7 +168,7 @@ def plain_extract(filename, include_filename_in_id=False, encoding="autodetect")
 
     if encoding == "autodetect":
         fileencoding = detect_encoding(filename)
-        log.warn(
+        log.warning(
             f"Autodetected encoding as '{fileencoding}'"
             ", it is recommended to specify it explicitly with --plain-encoding"
         )

From 14a4bc56d85bd953153bf64bcb95a92413814efb Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 22 Apr 2025 18:24:35 +0200
Subject: [PATCH 18/20] =?UTF-8?q?=F0=9F=90=9B=20Add=20--plain-encoding=20o?=
 =?UTF-8?q?ption=20to=20dinglehopper-extract?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/cli_extract.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/dinglehopper/cli_extract.py b/src/dinglehopper/cli_extract.py
index 9c51d34..5fce032 100644
--- a/src/dinglehopper/cli_extract.py
+++ b/src/dinglehopper/cli_extract.py
@@ -12,7 +12,12 @@ from .ocr_files import extract
     help="PAGE TextEquiv level to extract text from",
     metavar="LEVEL",
 )
-def main(input_file, textequiv_level):
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding (e.g. "utf-8") of plain text files',
+)
+def main(input_file, textequiv_level, plain_encoding):
     """
     Extract the text of the given INPUT_FILE.
 
@@ -23,7 +28,9 @@ def main(input_file, textequiv_level):
     use "--textequiv-level line" to extract from the level of TextLine tags.
     """
     initLogging()
-    input_text = extract(input_file, textequiv_level=textequiv_level).text
+    input_text = extract(
+        input_file, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    ).text
     print(input_text)
 
 

From 9fc8937324b8ba2c94ddd865fb8c05fa5f92c49d Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 24 Apr 2025 15:13:19 +0200
Subject: [PATCH 19/20] =?UTF-8?q?=E2=9C=92=20=20README:=20Mention=20dingle?=
 =?UTF-8?q?hopper-line-dirs=20--help?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 76fcc5a..a40db79 100644
--- a/README.md
+++ b/README.md
@@ -112,9 +112,13 @@ You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.
 with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate
 CLI interface:
 
-~~~
+```
 dinglehopper-line-dirs gt/ ocr/
-~~~
+```
+
+The CLI `dinglehopper-line-dirs` can also work with GT text files in the same
+directories as the OCR text files. You should read `dinglehopper-line-dirs --help`
+in this case.
 
 ### dinglehopper-extract
 The tool `dinglehopper-extract` extracts the text of the given input file on

From 5639f3db7f12647694c4ef03437af00227f45f58 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 24 Apr 2025 16:44:29 +0200
Subject: [PATCH 20/20] =?UTF-8?q?=E2=9C=94=20=20Add=20a=20tests=20that=20c?=
 =?UTF-8?q?hecks=20if=20plain=20text=20files=20with=20BOM=20are=20read=20c?=
 =?UTF-8?q?orrectly?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dinglehopper/tests/test_ocr_files.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/dinglehopper/tests/test_ocr_files.py b/src/dinglehopper/tests/test_ocr_files.py
index 342507a..0c2a500 100644
--- a/src/dinglehopper/tests/test_ocr_files.py
+++ b/src/dinglehopper/tests/test_ocr_files.py
@@ -182,3 +182,15 @@ def test_plain(tmp_path):
         result = plain_text("ocr.txt")
         expected = "First, a line.\nAnd a second line."
         assert result == expected
+
+
+def test_plain_BOM(tmp_path):
+    """Test that plain text files with BOM are read correctly."""
+    BOM = "\ufeff"
+    with working_directory(tmp_path):
+        with open("ocr.txt", "w") as ocrf:
+            ocrf.write(BOM + "First, a line.\nAnd a second line.\n")
+
+        result = plain_text("ocr.txt")
+        expected = "First, a line.\nAnd a second line."
+        assert result == expected