🐛 Fix reading plain text files

As reported by @tallemeersch in gh-107, the trailing newline of each line was not removed when reading plain text files.
Fix this by stripping each line with `line.strip()` before building its segment, as suggested.

Fixes gh-107.
pull/116/head
Mike Gerber 8 months ago
parent 41a0fad352
commit b336f98271

@ -36,7 +36,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]:
for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap): for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
line_id = line.attrib.get("ID") line_id = line.attrib.get("ID")
line_text = " ".join( line_text = " ".join(
string.attrib.get("CONTENT") string.attrib.get("CONTENT", "")
for string in line.iterfind("alto:String", namespaces=nsmap) for string in line.iterfind("alto:String", namespaces=nsmap)
) )
normalized_text = normalize_sbb(line_text) normalized_text = normalize_sbb(line_text)
@ -167,7 +167,7 @@ def plain_extract(filename, include_filename_in_id=False):
with open(filename, "r", encoding=fileencoding) as f: with open(filename, "r", encoding=fileencoding) as f:
return ExtractedText( return ExtractedText(
None, None,
[make_segment(no, line) for no, line in enumerate(f.readlines())], [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())],
"\n", "\n",
None, None,
None, None,

@ -177,8 +177,8 @@ def test_text():
def test_plain(tmp_path): def test_plain(tmp_path):
with working_directory(tmp_path): with working_directory(tmp_path):
with open("ocr.txt", "w") as ocrf: with open("ocr.txt", "w") as ocrf:
ocrf.write("AAAAB") ocrf.write("First, a line.\nAnd a second line.\n")
result = plain_text("ocr.txt") result = plain_text("ocr.txt")
expected = "AAAAB" expected = "First, a line.\nAnd a second line."
assert result == expected assert result == expected

Loading…
Cancel
Save