Merge pull request #142 from qurator-spk/feat/flex-line-dirs

Feat/flex line dirs
2025-09-18 13:19:54 +02:00 · 2025-04-24 16:48:22 +02:00 · 2025-04-24 16:48:22 +02:00 · d7814db705
commit d7814db705
parent 3c317cbeaf 5639f3db7f
26 changed files with 368 additions and 34 deletions
--- a/.gitignore
+++ b/.gitignore
@ -25,6 +25,7 @@ dmypy.json
 # User-specific stuff
 .idea
 .*.swp
 # Build artifacts
 /build
--- a/README.md
+++ b/README.md
@ -112,9 +112,13 @@ You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.
 with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate
 CLI interface:
-~~~
+```
 dinglehopper-line-dirs gt/ ocr/
-~~~
+```
 The CLI `dinglehopper-line-dirs` can also work with GT text files in the same
 directories as the OCR text files. You should read `dinglehopper-line-dirs --help`
 in this case.
 ### dinglehopper-extract
 The tool `dinglehopper-extract` extracts the text of the given input file on
--- a/src/dinglehopper/cli.py
+++ b/src/dinglehopper/cli.py
@ -114,6 +114,7 @@ def process(
    metrics: bool = True,
    differences: bool = False,
    textequiv_level: str = "region",
    plain_encoding: str = "autodetect",
 ) -> None:
    """Check OCR result against GT.
@ -121,8 +122,12 @@ def process(
    this undecorated version and use Click on a wrapper.
    """
-    gt_text = extract(gt, textequiv_level=textequiv_level)
+    gt_text = extract(
-    ocr_text = extract(ocr, textequiv_level=textequiv_level)
+        gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding
    )
    ocr_text = extract(
        ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding
    )
    gt_words: List[str] = list(words_normalized(gt_text))
    ocr_words: List[str] = list(words_normalized(ocr_text))
@ -195,6 +200,7 @@ def process_dir(
    metrics: bool = True,
    differences: bool = False,
    textequiv_level: str = "region",
    plain_encoding: str = "autodetect",
 ) -> None:
    for gt_file in os.listdir(gt):
        gt_file_path = os.path.join(gt, gt_file)
@ -209,6 +215,7 @@ def process_dir(
                metrics=metrics,
                differences=differences,
                textequiv_level=textequiv_level,
                plain_encoding=plain_encoding,
            )
        else:
            print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
@ -233,6 +240,11 @@ def process_dir(
    help="PAGE TextEquiv level to extract text from",
    metavar="LEVEL",
 )
@click.option(
    "--plain-encoding",
    default="autodetect",
    help='Encoding (e.g. "utf-8") of plain text files',
 )
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
@click.version_option()
 def main(
@ -243,6 +255,7 @@ def main(
    metrics,
    differences,
    textequiv_level,
    plain_encoding,
    progress,
 ):
    """
@ -280,6 +293,7 @@ def main(
                metrics=metrics,
                differences=differences,
                textequiv_level=textequiv_level,
                plain_encoding=plain_encoding,
            )
    else:
        process(
@ -290,6 +304,7 @@ def main(
            metrics=metrics,
            differences=differences,
            textequiv_level=textequiv_level,
            plain_encoding=plain_encoding,
        )
--- a/src/dinglehopper/cli_extract.py
+++ b/src/dinglehopper/cli_extract.py
@ -12,7 +12,12 @@ from .ocr_files import extract
    help="PAGE TextEquiv level to extract text from",
    metavar="LEVEL",
 )
-def main(input_file, textequiv_level):
+@click.option(
    "--plain-encoding",
    default="autodetect",
    help='Encoding (e.g. "utf-8") of plain text files',
 )
 def main(input_file, textequiv_level, plain_encoding):
    """
    Extract the text of the given INPUT_FILE.
@ -23,7 +28,9 @@ def main(input_file, textequiv_level):
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    initLogging()
-    input_text = extract(input_file, textequiv_level=textequiv_level).text
+    input_text = extract(
        input_file, textequiv_level=textequiv_level, plain_encoding=plain_encoding
    ).text
    print(input_text)
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@ -1,5 +1,6 @@
 import itertools
 import os
 from typing import Callable, Iterator, List, Optional, Tuple
 import click
 from jinja2 import Environment, FileSystemLoader
@ -12,6 +13,41 @@ from .ocr_files import plain_extract
 from .word_error_rate import word_error_rate_n, words_normalized
 def removesuffix(text, suffix):
    """
    Return text without its trailing suffix, or text unchanged if it does not
    end with suffix.

    An empty suffix never matches, so text is returned as-is in that case.
    Backport of str.removesuffix, which needs Python >= 3.9.
    """
    if not (suffix and text.endswith(suffix)):
        return text
    cut = -len(suffix)
    return text[:cut]
 def is_hidden(filepath):
    """
    Tell whether the last component of filepath starts with a dot.

    The path is made absolute before its last component is taken; parent
    directory names are not inspected.
    """
    absolute = os.path.abspath(filepath)
    _, name = os.path.split(absolute)
    return name.startswith(".")
 def find_all_files(
    dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False
 ) -> Iterator[str]:
    """
    Recursively find all files below dir_, yielding their paths.

    Each yielded path is the walked directory joined with the filename.
    Results come in a deterministic order: within a directory, files are
    yielded in sorted order, and subdirectories are visited in sorted order.

    If pred is given, pred(filename) must be True for the filename (the bare
    name, without its directory) for the file to be yielded.

    Hidden files (name starting with ".") are skipped unless return_hidden is
    True. Only the filename is checked, so files inside hidden directories are
    still yielded.
    """
    for root, dirnames, filenames in os.walk(dir_):
        # os.walk lists entries in arbitrary order. Sorting the directory list
        # in place makes the top-down traversal order deterministic.
        dirnames.sort()
        for fn in sorted(filenames):
            if not return_hidden and is_hidden(fn):
                continue
            if pred and not pred(fn):
                continue
            yield os.path.join(root, fn)
 def all_equal(iterable):
    """
    Return True if iterable has at most one run of consecutive equal elements.

    An empty iterable counts as all-equal.
    """
    runs = itertools.groupby(iterable)
    # Skip the first run (if any); a second run means a differing element.
    next(runs, None)
    return next(runs, None) is None
@ -25,15 +61,63 @@ def common_suffix(its):
    return reversed(common_prefix(reversed(it) for it in its))
-def removesuffix(text, suffix):
+def find_gt_and_ocr_files(
-    if suffix and text.endswith(suffix):
+    gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str
-        return text[: -len(suffix)]
+) -> Iterator[Tuple[str, str]]:
-    return text
+    """
    Find GT files and matching OCR files.
    Returns pairs of GT and OCR files.
    """
    for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
        ocr_fn = os.path.join(
            ocr_dir,
            removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix,
        )
        if not os.path.exists(ocr_fn):
            raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
        yield gt_fn, ocr_fn
-def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
+def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
-    gt_suffix = "".join(common_suffix(os.listdir(gt_dir)))
+    """
-    ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir)))
+    Find GT files and matching OCR files, autodetect suffixes.
    This only works if gt_dir (or respectively ocr_dir) only contains GT (OCR)
    files with a common suffix. Currently the files must have a suffix, e.g.
    ".gt.txt" (e.g. ".ocr.txt").
    Returns pairs of GT and OCR files.
    """
    # Autodetect suffixes
    gt_files = find_all_files(gt_dir)
    gt_suffix = "".join(common_suffix(gt_files))
    if len(gt_suffix) == 0:
        raise RuntimeError(
            f"Files in GT directory {gt_dir} do not have a common suffix"
        )
    ocr_files = find_all_files(ocr_dir)
    ocr_suffix = "".join(common_suffix(ocr_files))
    if len(ocr_suffix) == 0:
        raise RuntimeError(
            f"Files in OCR directory {ocr_dir} do not have a common suffix"
        )
    yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
 def process(
    gt_dir,
    ocr_dir,
    report_prefix,
    *,
    metrics=True,
    gt_suffix=None,
    ocr_suffix=None,
    plain_encoding="autodetect",
 ):
    cer = None
    n_characters = None
@ -42,16 +126,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
    n_words = None
    word_diff_report = ""
-    for k, gt in enumerate(os.listdir(gt_dir)):
+    if gt_suffix is not None and ocr_suffix is not None:
-        # Find a match by replacing the suffix
+        gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
-        ocr = removesuffix(gt, gt_suffix) + ocr_suffix
+    else:
        gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
-        gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
+    for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
-        ocr_text = plain_extract(
+        gt_text = plain_extract(
-            os.path.join(ocr_dir, ocr), include_filename_in_id=True
+            gt_fn, include_filename_in_id=True, encoding=plain_encoding
        )
-        gt_words = words_normalized(gt_text)
+        ocr_text = plain_extract(
-        ocr_words = words_normalized(ocr_text)
+            ocr_fn, include_filename_in_id=True, encoding=plain_encoding
        )
        gt_words: List[str] = list(words_normalized(gt_text))
        ocr_words: List[str] = list(words_normalized(ocr_text))
        # Compute CER
        l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
@ -81,7 +169,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
            joiner="",
            none="·",
            score_hint=score_hint(l_cer, l_n_characters),
-        )
+        )[0]
        word_diff_report += gen_diff_report(
            gt_words,
            ocr_words,
@ -89,7 +177,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
            joiner=" ",
            none="⋯",
            score_hint=score_hint(l_wer, l_n_words),
-        )
+        )[0]
    env = Environment(
        loader=FileSystemLoader(
@ -123,17 +211,30 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
@click.option(
    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
 )
-def main(gt, ocr, report_prefix, metrics):
+@click.option("--gt-suffix", help="Suffix of GT line text files")
@click.option("--ocr-suffix", help="Suffix of OCR line text files")
@click.option(
    "--plain-encoding",
    default="autodetect",
    help='Encoding (e.g. "utf-8") of plain text files',
 )
 def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
    """
    Compare the GT line text directory against the OCR line text directory.
    This assumes that the GT line text directory contains textfiles with a common
    suffix like ".gt.txt", and the OCR line text directory contains textfiles with
    a common suffix like ".some-ocr.txt". The text files also need to be paired,
-    i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
+    i.e. the GT filename "line001.gt.txt" needs to match a filename
-    in the OCT lines directory.
+    "line001.some-ocr.txt" in the OCR lines directory.
-    The GT and OCR directories are usually round truth line texts and the results of
+    GT and OCR directories may contain line text files in matching subdirectories,
    e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt".
    GT and OCR directories can also be the same directory, but in this case you need
    to give --gt-suffix and --ocr-suffix explicitly.
    The GT and OCR directories are usually ground truth line texts and the results of
    an OCR software, but you may use dinglehopper to compare two OCR results. In
    that case, use --no-metrics to disable the then meaningless metrics and also
    change the color scheme from green/red to blue.
@ -142,9 +243,19 @@ def main(gt, ocr, report_prefix, metrics):
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).
    It is recommended to specify the encoding of the text files, for example with
    --plain-encoding utf-8. If this option is not given, we try to auto-detect it.
    """
    initLogging()
-    process(gt, ocr, report_prefix, metrics=metrics)
+    process(
        gt,
        ocr,
        report_prefix,
        metrics=metrics,
        gt_suffix=gt_suffix,
        ocr_suffix=ocr_suffix,
        plain_encoding=plain_encoding,
    )
 if __name__ == "__main__":
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional
 import chardet
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
 from ocrd_utils import getLogger
 from uniseg.graphemecluster import grapheme_clusters
 from .extracted_text import ExtractedText, normalize_sbb
 log = getLogger("processor.OcrdDinglehopperEvaluate")
 def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
    """Return the ALTO namespace used in the given ElementTree.
@ -149,7 +152,7 @@ def detect_encoding(filename):
    return chardet.detect(open(filename, "rb").read(1024))["encoding"]
-def plain_extract(filename, include_filename_in_id=False):
+def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"):
    id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
    def make_segment(no, line):
@ -163,7 +166,14 @@ def plain_extract(filename, include_filename_in_id=False):
            clusters,
        )
-    fileencoding = detect_encoding(filename)
+    if encoding == "autodetect":
        fileencoding = detect_encoding(filename)
        log.warning(
            f"Autodetected encoding as '{fileencoding}'"
            ", it is recommended to specify it explicitly with --plain-encoding"
        )
    else:
        fileencoding = encoding
    with open(filename, "r", encoding=fileencoding) as f:
        return ExtractedText(
            None,
@ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False):
    # XXX hardcoded SBB normalization
-def plain_text(filename):
+def plain_text(filename, encoding="autodetect"):
-    return plain_extract(filename).text
+    return plain_extract(filename, encoding=encoding).text
-def extract(filename, *, textequiv_level="region"):
+def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"):
    """Extract the text from the given file.
    Supports PAGE, ALTO and falls back to plain text.
@ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"):
    try:
        tree = ET.parse(filename)
    except (XMLSyntaxError, UnicodeDecodeError):
-        return plain_extract(filename)
+        return plain_extract(filename, encoding=plain_encoding)
    try:
        return page_extract(tree, textequiv_level=textequiv_level)
    except ValueError:
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@ -25,6 +25,11 @@
          "enum": ["region", "line"],
          "default": "region",
          "description": "PAGE XML hierarchy level to extract the text from"
        },
        "plain_encoding": {
          "type": "string",
          "default": "autodetect",
          "description": "Encoding (e.g. \"utf-8\") of plain text files"
        }
      }
    }
--- a/src/dinglehopper/ocrd_cli.py
+++ b/src/dinglehopper/ocrd_cli.py
@ -26,6 +26,7 @@ class OcrdDinglehopperEvaluate(Processor):
        assert self.parameter
        metrics = self.parameter["metrics"]
        textequiv_level = self.parameter["textequiv_level"]
        plain_encoding = self.parameter["plain_encoding"]
        # wrong number of inputs: let fail
        gt_file, ocr_file = input_files
@ -52,6 +53,7 @@ class OcrdDinglehopperEvaluate(Processor):
            self.output_file_grp,
            metrics=metrics,
            textequiv_level=textequiv_level,
            plain_encoding=plain_encoding,
        )
        # Add reports to the workspace
--- a/src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt
+++ b/src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt
@ -0,0 +1 @@
 This is a test.
--- a/src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt
+++ b/src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt
@ -0,0 +1 @@
 Another test.
--- a/src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt
+++ b/src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt
@ -0,0 +1 @@
 Tis is a test.
--- a/src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt
+++ b/src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt
@ -0,0 +1 @@
 AnÖther test.
--- a/src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg
+++ b/src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg
--- a/src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt
+++ b/src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt
@ -0,0 +1 @@
 This is a test.
--- a/src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt
+++ b/src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt
@ -0,0 +1 @@
 Tis is a test.
--- a/src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg
+++ b/src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg
--- a/src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt
+++ b/src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt
@ -0,0 +1 @@
 Another test.
--- a/src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt
+++ b/src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt
@ -0,0 +1 @@
 AnÖther test.
--- a/src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt
+++ b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt
@ -0,0 +1 @@
 This is a test.
--- a/src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt
+++ b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt
@ -0,0 +1 @@
 Another test.
--- a/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt
+++ b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt
@ -0,0 +1 @@
 Tis is a test.
--- a/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt
+++ b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt
@ -0,0 +1 @@
 AnÖther test.
--- a/src/dinglehopper/tests/test_integ_cli_line_dirs.py
+++ b/src/dinglehopper/tests/test_integ_cli_line_dirs.py
@ -0,0 +1,61 @@
 import json
 import os.path
 import re
 import pytest
 from ..cli_line_dirs import process
 from .util import working_directory
 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
 def test_cli_line_dirs_basic(tmp_path):
    """Check the CER and WER that process() writes to report.json for the basic dirs"""
    gt_lines = os.path.join(data_dir, "line_dirs/basic/gt")
    ocr_lines = os.path.join(data_dir, "line_dirs/basic/ocr")
    with working_directory(tmp_path):
        process(gt_lines, ocr_lines, "report")
        # Print the raw report before parsing it
        with open("report.json", "r") as report_file:
            raw_report = report_file.read()
        print(raw_report)
        report = json.loads(raw_report)
        assert report["cer"] == pytest.approx(0.1071429)
        assert report["wer"] == pytest.approx(0.5)
@pytest.mark.integration
 def test_cli_line_dirs_basic_report_diff(tmp_path):
    """Test that the cli/process() produces a report with char+word diff"""
    gt_lines = os.path.join(data_dir, "line_dirs/basic/gt")
    ocr_lines = os.path.join(data_dir, "line_dirs/basic/ocr")
    with working_directory(tmp_path):
        process(gt_lines, ocr_lines, "report")
        with open("report.html", "r") as report_file:
            html = report_file.read()
    # Each of the two GT lines should show up once per diff kind
    cdiff_hits = re.findall(r"gt.*l\d+-cdiff", html)
    wdiff_hits = re.findall(r"gt.*l\d+-wdiff", html)
    assert len(cdiff_hits) == 2
    assert len(wdiff_hits) == 2
@pytest.mark.integration
 def test_cli_line_dirs_merged(tmp_path):
    """Check CER/WER when GT and OCR files share directories and suffixes are given"""
    merged_dir = os.path.join(data_dir, "line_dirs/merged")
    with working_directory(tmp_path):
        process(
            merged_dir,
            merged_dir,
            "report",
            gt_suffix=".gt.txt",
            ocr_suffix=".some-ocr.txt",
        )
        # Print the raw report before parsing it
        with open("report.json", "r") as report_file:
            raw_report = report_file.read()
        print(raw_report)
        report = json.loads(raw_report)
        assert report["cer"] == pytest.approx(0.1071429)
        assert report["wer"] == pytest.approx(0.5)
--- a/src/dinglehopper/tests/test_integ_cli_valid_report.py
+++ b/src/dinglehopper/tests/test_integ_cli_valid_report.py
@ -1,4 +1,5 @@
 import json
 import re
 import pytest
@ -40,3 +41,25 @@ def test_cli_json_cer_is_infinity(tmp_path):
        with open("report.json", "r") as jsonf:
            j = json.load(jsonf)
            assert j["cer"] == pytest.approx(float("inf"))
@pytest.mark.integration
 def test_cli_html(tmp_path):
    """Check that process() on two plain text files writes a complete HTML report"""
    with working_directory(tmp_path):
        # GT and OCR differ in the last character only
        for name, content in (("gt.txt", "AAAAA"), ("ocr.txt", "AAAAB")):
            with open(name, "w") as text_file:
                text_file.write(content)
        process("gt.txt", "ocr.txt", "report")
        with open("report.html", "r") as report_file:
            html_report = report_file.read()
        print(html_report)
        cer_match = re.search(r"CER: 0\.\d+", html_report)
        assert cer_match
        wer_match = re.search(r"WER: 1\.0", html_report)
        assert wer_match
        assert len(re.findall("gt.*cdiff", html_report)) == 1
        assert len(re.findall("gt.*wdiff", html_report)) == 1
--- a/src/dinglehopper/tests/test_line_dirs.py
+++ b/src/dinglehopper/tests/test_line_dirs.py
@ -0,0 +1,71 @@
 import os
 from ..cli_line_dirs import find_gt_and_ocr_files, find_gt_and_ocr_files_autodetect
 data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
 def test_basic():
    """Explicit suffixes: the two GT files in basic/gt are each paired with an OCR file."""
    gt_dir = os.path.join(data_dir, "line_dirs/basic/gt")
    ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr")
    matched = find_gt_and_ocr_files(gt_dir, ".gt.txt", ocr_dir, ".some-ocr.txt")
    assert len(list(matched)) == 2
 def test_basic_autodetect():
    """Autodetected suffixes: only the directories are given, two pairs are expected."""
    gt_dir = os.path.join(data_dir, "line_dirs/basic/gt")
    ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr")
    matched = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
    assert len(list(matched)) == 2
 def test_subdirs():
    """Explicit suffixes: files in matching subdirectories are paired as well."""
    gt_dir = os.path.join(data_dir, "line_dirs/subdirs/gt")
    ocr_dir = os.path.join(data_dir, "line_dirs/subdirs/ocr")
    matched = find_gt_and_ocr_files(gt_dir, ".gt.txt", ocr_dir, ".some-ocr.txt")
    assert len(list(matched)) == 2
 def test_subdirs_autodetect():
    """Autodetected suffixes: files in matching subdirectories are paired as well."""
    gt_dir = os.path.join(data_dir, "line_dirs/subdirs/gt")
    ocr_dir = os.path.join(data_dir, "line_dirs/subdirs/ocr")
    matched = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
    assert len(list(matched)) == 2
 def test_merged():
    """Explicit suffixes: GT and OCR texts live side by side in the same directories."""
    merged_dir = os.path.join(data_dir, "line_dirs/merged")
    matched = find_gt_and_ocr_files(
        merged_dir, ".gt.txt", merged_dir, ".some-ocr.txt"
    )
    assert len(list(matched)) == 2
--- a/src/dinglehopper/tests/test_ocr_files.py
+++ b/src/dinglehopper/tests/test_ocr_files.py
@ -182,3 +182,15 @@ def test_plain(tmp_path):
        result = plain_text("ocr.txt")
        expected = "First, a line.\nAnd a second line."
        assert result == expected
 def test_plain_BOM(tmp_path):
    """Test that plain text files with BOM are read correctly."""
    BOM = "\ufeff"
    with working_directory(tmp_path):
        # Write explicitly as UTF-8: the default encoding of open() depends on
        # the platform locale and may not be able to encode the BOM at all.
        with open("ocr.txt", "w", encoding="utf-8") as ocrf:
            ocrf.write(BOM + "First, a line.\nAnd a second line.\n")
        result = plain_text("ocr.txt")
        expected = "First, a line.\nAnd a second line."
        assert result == expected