From 32a4b95a99caa28b94937eb2b869b065bc188550 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 10 Nov 2020 18:51:14 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20dinglehopper:=20Normalize=20in?= =?UTF-8?q?=20plain=5Fextract()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/dinglehopper/ocr_files.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 755061c..4f202ca 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -113,12 +113,13 @@ def plain_extract(filename): return ExtractedText( None, [ - ExtractedText("line %d" % no, None, None, line) + ExtractedText("line %d" % no, None, None, normalize_sbb(line)) for no, line in enumerate(f.readlines()) ], "\n", None, ) + # XXX hardcoded SBB normalization def plain_text(filename):