🐛 Fix reading plain text files

As reported by @tallemeersch in gh-107, newlines were not removed for plain text files. Fix this by stripping the lines as suggested. Fixes gh-107.
2026-03-16 20:22:03 +01:00 · 2024-05-06 18:14:16 +02:00 · 2024-05-06 18:14:16 +02:00 · b336f98271
commit b336f98271
parent 41a0fad352
2 changed files with 4 additions and 4 deletions
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@@ -36,7 +36,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]:
    for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
        line_id = line.attrib.get("ID")
        line_text = " ".join(
-            string.attrib.get("CONTENT")
+            string.attrib.get("CONTENT", "")
            for string in line.iterfind("alto:String", namespaces=nsmap)
        )
        normalized_text = normalize_sbb(line_text)
@@ -167,7 +167,7 @@ def plain_extract(filename, include_filename_in_id=False):
    with open(filename, "r", encoding=fileencoding) as f:
        return ExtractedText(
            None,
-            [make_segment(no, line) for no, line in enumerate(f.readlines())],
+            [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())],
            "\n",
            None,
            None,
--- a/src/dinglehopper/tests/test_ocr_files.py
+++ b/src/dinglehopper/tests/test_ocr_files.py
@@ -177,8 +177,8 @@ def test_text():
 def test_plain(tmp_path):
    with working_directory(tmp_path):
        with open("ocr.txt", "w") as ocrf:
-            ocrf.write("AAAAB")
+            ocrf.write("First, a line.\nAnd a second line.\n")

        result = plain_text("ocr.txt")
-        expected = "AAAAB"
+        expected = "First, a line.\nAnd a second line."
        assert result == expected