mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-07 19:05:13 +02:00
🐛 Detect encoding (incl BOM) when reading files
As @imlabormitlea-code reported in gh-79, dinglehopper did not handle text files with a BOM well. Fix this by using chardet to detect the encoding (which also detects a BOM) and then reading the files with the detected encoding, so that the BOM is not included in the resulting extracted text. Fixes gh-80.
This commit is contained in:
parent
325e5af5f5
commit
69325facf2
2 changed files with 10 additions and 2 deletions
|
@ -11,3 +11,4 @@ multimethod == 1.3 # latest version to officially support Python 3.5
|
||||||
tqdm
|
tqdm
|
||||||
rapidfuzz >= 2.4.2
|
rapidfuzz >= 2.4.2
|
||||||
six # XXX workaround OCR-D/core#730
|
six # XXX workaround OCR-D/core#730
|
||||||
|
chardet
|
||||||
|
|
|
@ -7,6 +7,7 @@ from warnings import warn
|
||||||
|
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
from lxml.etree import XMLSyntaxError
|
from lxml.etree import XMLSyntaxError
|
||||||
|
import chardet
|
||||||
|
|
||||||
from .extracted_text import ExtractedText, normalize_sbb
|
from .extracted_text import ExtractedText, normalize_sbb
|
||||||
|
|
||||||
|
@ -135,9 +136,15 @@ def page_text(tree, *, textequiv_level="region"):
|
||||||
return page_extract(tree, textequiv_level=textequiv_level).text
|
return page_extract(tree, textequiv_level=textequiv_level).text
|
||||||
|
|
||||||
|
|
||||||
|
def detect_encoding(filename):
|
||||||
|
return chardet.detect(open(filename, "rb").read(1024))["encoding"]
|
||||||
|
|
||||||
|
|
||||||
def plain_extract(filename, include_filename_in_id=False):
|
def plain_extract(filename, include_filename_in_id=False):
|
||||||
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
|
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
|
||||||
with open(filename, "r") as f:
|
|
||||||
|
fileencoding = detect_encoding(filename)
|
||||||
|
with open(filename, "r", encoding=fileencoding) as f:
|
||||||
return ExtractedText(
|
return ExtractedText(
|
||||||
None,
|
None,
|
||||||
[
|
[
|
||||||
|
@ -166,7 +173,7 @@ def extract(filename, *, textequiv_level="region"):
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
tree = ET.parse(filename)
|
tree = ET.parse(filename)
|
||||||
except XMLSyntaxError:
|
except (XMLSyntaxError, UnicodeDecodeError):
|
||||||
return plain_extract(filename)
|
return plain_extract(filename)
|
||||||
try:
|
try:
|
||||||
return page_extract(tree, textequiv_level=textequiv_level)
|
return page_extract(tree, textequiv_level=textequiv_level)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue