From 69325facf2a8e045b80c5f79679262a27b30e3eb Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 3 Aug 2023 17:48:13 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Detect=20encoding=20(incl=20BOM)?= =?UTF-8?q?=20when=20reading=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As @imlabormitlea-code reported in gh-79, dinglehopper did not handle text files with a BOM well. Fix this by using chardet to detect the encoding (which also detects a BOM) and reading the file with the detected encoding, so that the BOM is not included in the resulting extracted text. Fixes gh-80. --- requirements.txt | 1 + src/dinglehopper/ocr_files.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index daf2b0f..8ee3d1d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ multimethod == 1.3 # latest version to officially support Python 3.5 tqdm rapidfuzz >= 2.4.2 six # XXX workaround OCR-D/core#730 +chardet diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py index 97e56ed..42a085f 100644 --- a/src/dinglehopper/ocr_files.py +++ b/src/dinglehopper/ocr_files.py @@ -7,6 +7,7 @@ from warnings import warn from lxml import etree as ET from lxml.etree import XMLSyntaxError +import chardet from .extracted_text import ExtractedText, normalize_sbb @@ -135,9 +136,15 @@ def page_text(tree, *, textequiv_level="region"): return page_extract(tree, textequiv_level=textequiv_level).text +def detect_encoding(filename): + return chardet.detect(open(filename, "rb").read(1024))["encoding"] + + def plain_extract(filename, include_filename_in_id=False): id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}" - with open(filename, "r") as f: + + fileencoding = detect_encoding(filename) + with open(filename, "r", encoding=fileencoding) as f: return ExtractedText( None, [ @@ -166,7 +173,7 @@ def extract(filename, *, textequiv_level="region"):
""" try: tree = ET.parse(filename) - except XMLSyntaxError: + except (XMLSyntaxError, UnicodeDecodeError): return plain_extract(filename) try: return page_extract(tree, textequiv_level=textequiv_level)