From b336f98271036830dcf5d2456ffa8b87752e9c16 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 6 May 2024 18:14:16 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Fix=20reading=20plain=20text=20f?= =?UTF-8?q?iles?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As reported by @tallemeersch in gh-107, newlines were not removed for plain text files. Fix this by stripping the lines as suggested. Fixes gh-107. --- src/dinglehopper/ocr_files.py | 4 ++-- src/dinglehopper/tests/test_ocr_files.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py index 0c4fa04..1593f44 100644 --- a/src/dinglehopper/ocr_files.py +++ b/src/dinglehopper/ocr_files.py @@ -36,7 +36,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]: for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap): line_id = line.attrib.get("ID") line_text = " ".join( - string.attrib.get("CONTENT") + string.attrib.get("CONTENT", "") for string in line.iterfind("alto:String", namespaces=nsmap) ) normalized_text = normalize_sbb(line_text) @@ -167,7 +167,7 @@ def plain_extract(filename, include_filename_in_id=False): with open(filename, "r", encoding=fileencoding) as f: return ExtractedText( None, - [make_segment(no, line) for no, line in enumerate(f.readlines())], + [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())], "\n", None, None, diff --git a/src/dinglehopper/tests/test_ocr_files.py b/src/dinglehopper/tests/test_ocr_files.py index 4790c85..342507a 100644 --- a/src/dinglehopper/tests/test_ocr_files.py +++ b/src/dinglehopper/tests/test_ocr_files.py @@ -177,8 +177,8 @@ def test_text(): def test_plain(tmp_path): with working_directory(tmp_path): with open("ocr.txt", "w") as ocrf: - ocrf.write("AAAAB") + ocrf.write("First, a line.\nAnd a second line.\n") result = plain_text("ocr.txt") - expected = "AAAAB" + expected = "First, a line.\nAnd a second line." assert result == expected