mirror of
				https://github.com/qurator-spk/dinglehopper.git
				synced 2025-10-31 09:24:15 +01:00 
			
		
		
		
	🐛 Fix reading plain text files
As reported by @tallemeersch in gh-107, newlines were not removed for plain text files. Fix this by stripping the lines as suggested. Fixes gh-107.
This commit is contained in:
		
							parent
							
								
									41a0fad352
								
							
						
					
					
						commit
						b336f98271
					
				
					 2 changed files with 4 additions and 4 deletions
				
			
		|  | @ -36,7 +36,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]: | |||
|     for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap): | ||||
|         line_id = line.attrib.get("ID") | ||||
|         line_text = " ".join( | ||||
|-            string.attrib.get("CONTENT") | ||||
|+            string.attrib.get("CONTENT", "") | ||||
|             for string in line.iterfind("alto:String", namespaces=nsmap) | ||||
|         ) | ||||
|         normalized_text = normalize_sbb(line_text) | ||||
|  | @ -167,7 +167,7 @@ def plain_extract(filename, include_filename_in_id=False): | |||
|     with open(filename, "r", encoding=fileencoding) as f: | ||||
|         return ExtractedText( | ||||
|             None, | ||||
|-            [make_segment(no, line) for no, line in enumerate(f.readlines())], | ||||
|+            [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())], | ||||
|             "\n", | ||||
|             None, | ||||
|             None, | ||||
|  |  | |||
|  | @ -177,8 +177,8 @@ def test_text(): | |||
| def test_plain(tmp_path): | ||||
|     with working_directory(tmp_path): | ||||
|         with open("ocr.txt", "w") as ocrf: | ||||
|-            ocrf.write("AAAAB") | ||||
|+            ocrf.write("First, a line.\nAnd a second line.\n") | ||||
|  | ||||
|         result = plain_text("ocr.txt") | ||||
|-        expected = "AAAAB" | ||||
|+        expected = "First, a line.\nAnd a second line." | ||||
|         assert result == expected | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue