mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-07 19:05:13 +02:00
🐛 Fix reading plain text files
As reported by @tallemeersch in gh-107, trailing newlines were not removed from the lines read from plain text files. Fix this by stripping each line, as suggested. Fixes gh-107.
This commit is contained in:
parent
41a0fad352
commit
b336f98271
2 changed files with 4 additions and 4 deletions
|
@@ -36,7 +36,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]:
|
||||||
for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
|
for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
|
||||||
line_id = line.attrib.get("ID")
|
line_id = line.attrib.get("ID")
|
||||||
line_text = " ".join(
|
line_text = " ".join(
|
||||||
string.attrib.get("CONTENT")
|
string.attrib.get("CONTENT", "")
|
||||||
for string in line.iterfind("alto:String", namespaces=nsmap)
|
for string in line.iterfind("alto:String", namespaces=nsmap)
|
||||||
)
|
)
|
||||||
normalized_text = normalize_sbb(line_text)
|
normalized_text = normalize_sbb(line_text)
|
||||||
|
@@ -167,7 +167,7 @@ def plain_extract(filename, include_filename_in_id=False):
|
||||||
with open(filename, "r", encoding=fileencoding) as f:
|
with open(filename, "r", encoding=fileencoding) as f:
|
||||||
return ExtractedText(
|
return ExtractedText(
|
||||||
None,
|
None,
|
||||||
[make_segment(no, line) for no, line in enumerate(f.readlines())],
|
[make_segment(no, line.strip()) for no, line in enumerate(f.readlines())],
|
||||||
"\n",
|
"\n",
|
||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
|
|
|
@@ -177,8 +177,8 @@ def test_text():
|
||||||
def test_plain(tmp_path):
|
def test_plain(tmp_path):
|
||||||
with working_directory(tmp_path):
|
with working_directory(tmp_path):
|
||||||
with open("ocr.txt", "w") as ocrf:
|
with open("ocr.txt", "w") as ocrf:
|
||||||
ocrf.write("AAAAB")
|
ocrf.write("First, a line.\nAnd a second line.\n")
|
||||||
|
|
||||||
result = plain_text("ocr.txt")
|
result = plain_text("ocr.txt")
|
||||||
expected = "AAAAB"
|
expected = "First, a line.\nAnd a second line."
|
||||||
assert result == expected
|
assert result == expected
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue