🐛 Detect encoding (incl BOM) when reading files

As @imlabormitlea-code reported in gh-79, dinglehopper did not handle text files with
a BOM well. Fix this by using chardet to detect the encoding (which also detects the
BOM) and then reading the files with that encoding, so the BOM does not end up in the
resulting extracted text.

Fixes gh-80.
pull/90/head
Mike Gerber 9 months ago
parent 325e5af5f5
commit 69325facf2
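
For reference, a minimal sketch (not part of this commit) of why this works: recent
chardet versions report a file that starts with a UTF-8 BOM as the "UTF-8-SIG" codec,
and Python's open() strips the BOM when decoding with that codec. The file name below
is made up for the example.

```python
# Sketch only, not part of the commit: how chardet + open() handle a BOM.
import chardet

raw = "\ufefffoo bar".encode("utf-8")        # UTF-8 text starting with a BOM

with open("bom.txt", "wb") as f:
    f.write(raw)

# Recent chardet versions report the BOM'd bytes as "UTF-8-SIG".
encoding = chardet.detect(raw)["encoding"]

# Decoding with UTF-8-SIG strips the BOM, so it never reaches the extracted text.
with open("bom.txt", "r", encoding=encoding) as f:
    assert f.read() == "foo bar"
```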

@@ -11,3 +11,4 @@ multimethod == 1.3 # latest version to officially support Python 3.5
 tqdm
 rapidfuzz >= 2.4.2
 six # XXX workaround OCR-D/core#730
+chardet

@@ -7,6 +7,7 @@ from warnings import warn
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
+import chardet
 from .extracted_text import ExtractedText, normalize_sbb
@@ -135,9 +136,15 @@ def page_text(tree, *, textequiv_level="region"):
     return page_extract(tree, textequiv_level=textequiv_level).text
+def detect_encoding(filename):
+    return chardet.detect(open(filename, "rb").read(1024))["encoding"]
 def plain_extract(filename, include_filename_in_id=False):
     id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
-    with open(filename, "r") as f:
+    fileencoding = detect_encoding(filename)
+    with open(filename, "r", encoding=fileencoding) as f:
         return ExtractedText(
             None,
             [
@@ -166,7 +173,7 @@ def extract(filename, *, textequiv_level="region"):
     """
     try:
         tree = ET.parse(filename)
-    except XMLSyntaxError:
+    except (XMLSyntaxError, UnicodeDecodeError):
        return plain_extract(filename)
     try:
        return page_extract(tree, textequiv_level=textequiv_level)
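
A hypothetical usage sketch of the resulting behaviour (the module path and the file
name are assumptions for illustration): a plain-text ground-truth file with a BOM is
not valid PAGE/ALTO XML, so it now falls through to plain_extract() and comes back
without the BOM.

```python
# Hypothetical example; module path and input file are assumed, not from the commit.
from qurator.dinglehopper.ocr_files import extract

ext_text = extract("gt-with-bom.txt")   # XML parsing fails -> plain_extract()
print(ext_text.text)                    # plain text content, BOM already stripped
```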
