From b336f98271036830dcf5d2456ffa8b87752e9c16 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 6 May 2024 18:14:16 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Fix=20reading=20plain=20text=20f?=
 =?UTF-8?q?iles?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

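As reported by @tallemeersch in gh-107, line endings were not removed when
reading plain text files: every line returned by readlines() kept its trailing
newline, which then ended up in the extracted segment text. Fix this by
stripping each line before turning it into a segment, as suggested in the issue.

Also default a missing CONTENT attribute on ALTO String elements to an empty
string, so that joining the strings of a text line cannot fail on None.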

Fixes gh-107.
---
 src/dinglehopper/ocr_files.py            | 4 ++--
 src/dinglehopper/tests/test_ocr_files.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py
index 0c4fa04..1593f44 100644
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@@ -36,7 +36,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]:
     for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
         line_id = line.attrib.get("ID")
         line_text = " ".join(
-            string.attrib.get("CONTENT")
+            string.attrib.get("CONTENT", "")
             for string in line.iterfind("alto:String", namespaces=nsmap)
         )
         normalized_text = normalize_sbb(line_text)
@@ -167,7 +167,7 @@ def plain_extract(filename, include_filename_in_id=False):
     with open(filename, "r", encoding=fileencoding) as f:
         return ExtractedText(
             None,
-            [make_segment(no, line) for no, line in enumerate(f.readlines())],
+            [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())],
             "\n",
             None,
             None,
diff --git a/src/dinglehopper/tests/test_ocr_files.py b/src/dinglehopper/tests/test_ocr_files.py
index 4790c85..342507a 100644
--- a/src/dinglehopper/tests/test_ocr_files.py
+++ b/src/dinglehopper/tests/test_ocr_files.py
@@ -177,8 +177,8 @@ def test_text():
 def test_plain(tmp_path):
     with working_directory(tmp_path):
         with open("ocr.txt", "w") as ocrf:
-            ocrf.write("AAAAB")
+            ocrf.write("First, a line.\nAnd a second line.\n")
 
         result = plain_text("ocr.txt")
-        expected = "AAAAB"
+        expected = "First, a line.\nAnd a second line."
         assert result == expected