diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py index 0c4fa04..1593f44 100644 --- a/src/dinglehopper/ocr_files.py +++ b/src/dinglehopper/ocr_files.py @@ -36,7 +36,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]: for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap): line_id = line.attrib.get("ID") line_text = " ".join( - string.attrib.get("CONTENT") + string.attrib.get("CONTENT", "") for string in line.iterfind("alto:String", namespaces=nsmap) ) normalized_text = normalize_sbb(line_text) @@ -167,7 +167,7 @@ def plain_extract(filename, include_filename_in_id=False): with open(filename, "r", encoding=fileencoding) as f: return ExtractedText( None, - [make_segment(no, line) for no, line in enumerate(f.readlines())], + [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())], "\n", None, None, diff --git a/src/dinglehopper/tests/test_ocr_files.py b/src/dinglehopper/tests/test_ocr_files.py index 4790c85..342507a 100644 --- a/src/dinglehopper/tests/test_ocr_files.py +++ b/src/dinglehopper/tests/test_ocr_files.py @@ -177,8 +177,8 @@ def test_text(): def test_plain(tmp_path): with working_directory(tmp_path): with open("ocr.txt", "w") as ocrf: - ocrf.write("AAAAB") + ocrf.write("First, a line.\nAnd a second line.\n") result = plain_text("ocr.txt") - expected = "AAAAB" + expected = "First, a line.\nAnd a second line." assert result == expected