🐛 Fix reading plain text files

As reported by @tallemeersch in gh-107, trailing newlines were not removed from the lines read from plain text files.
Fix this by stripping each line as it is read, as suggested there.

Fixes gh-107.
pull/116/head
Mike Gerber 6 months ago
parent 41a0fad352
commit b336f98271

@@ -36,7 +36,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]:
for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
line_id = line.attrib.get("ID")
line_text = " ".join(
- string.attrib.get("CONTENT")
+ string.attrib.get("CONTENT", "")
for string in line.iterfind("alto:String", namespaces=nsmap)
)
normalized_text = normalize_sbb(line_text)
@@ -167,7 +167,7 @@ def plain_extract(filename, include_filename_in_id=False):
with open(filename, "r", encoding=fileencoding) as f:
return ExtractedText(
None,
- [make_segment(no, line) for no, line in enumerate(f.readlines())],
+ [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())],
"\n",
None,
None,

@@ -177,8 +177,8 @@ def test_text():
def test_plain(tmp_path):
with working_directory(tmp_path):
with open("ocr.txt", "w") as ocrf:
- ocrf.write("AAAAB")
+ ocrf.write("First, a line.\nAnd a second line.\n")
result = plain_text("ocr.txt")
- expected = "AAAAB"
+ expected = "First, a line.\nAnd a second line."
assert result == expected

Loading…
Cancel
Save