mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 03:40:12 +02:00
🚧 Add option for text encoding to line dir cli
This commit is contained in:
parent
ceea82063a
commit
9114be23de
2 changed files with 39 additions and 10 deletions
|
@ -109,7 +109,14 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
|
|||
|
||||
|
||||
def process(
|
||||
gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None
|
||||
gt_dir,
|
||||
ocr_dir,
|
||||
report_prefix,
|
||||
*,
|
||||
metrics=True,
|
||||
gt_suffix=None,
|
||||
ocr_suffix=None,
|
||||
plain_encoding="autodetect",
|
||||
):
|
||||
|
||||
cer = None
|
||||
|
@ -125,8 +132,12 @@ def process(
|
|||
gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
|
||||
|
||||
for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
|
||||
gt_text = plain_extract(gt_fn, include_filename_in_id=True)
|
||||
ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
|
||||
gt_text = plain_extract(
|
||||
gt_fn, include_filename_in_id=True, encoding=plain_encoding
|
||||
)
|
||||
ocr_text = plain_extract(
|
||||
ocr_fn, include_filename_in_id=True, encoding=plain_encoding
|
||||
)
|
||||
gt_words: List[str] = list(words_normalized(gt_text))
|
||||
ocr_words: List[str] = list(words_normalized(ocr_text))
|
||||
|
||||
|
@ -202,7 +213,12 @@ def process(
|
|||
)
|
||||
@click.option("--gt-suffix", help="Suffix of GT line text files")
|
||||
@click.option("--ocr-suffix", help="Suffix of OCR line text files")
|
||||
def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
|
||||
@click.option(
|
||||
"--plain-encoding",
|
||||
default="autodetect",
|
||||
help='Encoding (e.g. "utf-8") of plain text files',
|
||||
)
|
||||
def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
|
||||
"""
|
||||
Compare the GT line text directory against the OCR line text directory.
|
||||
|
||||
|
@ -227,6 +243,8 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
|
|||
$REPORT_PREFIX defaults to "report". The reports include the character error
|
||||
rate (CER) and the word error rate (WER).
|
||||
|
||||
It is recommended to specify the encoding of the text files, for example with
|
||||
--plain-encoding utf-8. If this option is not given, we try to auto-detect it.
|
||||
"""
|
||||
initLogging()
|
||||
process(
|
||||
|
@ -236,6 +254,7 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
|
|||
metrics=metrics,
|
||||
gt_suffix=gt_suffix,
|
||||
ocr_suffix=ocr_suffix,
|
||||
plain_encoding=plain_encoding,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional
|
|||
import chardet
|
||||
from lxml import etree as ET
|
||||
from lxml.etree import XMLSyntaxError
|
||||
from ocrd_utils import getLogger
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from .extracted_text import ExtractedText, normalize_sbb
|
||||
|
||||
log = getLogger("processor.OcrdDinglehopperEvaluate")
|
||||
|
||||
|
||||
def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
|
||||
"""Return the ALTO namespace used in the given ElementTree.
|
||||
|
@ -149,7 +152,7 @@ def detect_encoding(filename):
|
|||
return chardet.detect(open(filename, "rb").read(1024))["encoding"]
|
||||
|
||||
|
||||
def plain_extract(filename, include_filename_in_id=False):
|
||||
def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"):
|
||||
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
|
||||
|
||||
def make_segment(no, line):
|
||||
|
@ -163,7 +166,14 @@ def plain_extract(filename, include_filename_in_id=False):
|
|||
clusters,
|
||||
)
|
||||
|
||||
fileencoding = detect_encoding(filename)
|
||||
if encoding == "autodetect":
|
||||
fileencoding = detect_encoding(filename)
|
||||
log.warn(
|
||||
f"Autodetected encoding as '{fileencoding}'"
|
||||
", it is recommended to specify it explicitly with --plain-encoding"
|
||||
)
|
||||
else:
|
||||
fileencoding = encoding
|
||||
with open(filename, "r", encoding=fileencoding) as f:
|
||||
return ExtractedText(
|
||||
None,
|
||||
|
@ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False):
|
|||
# XXX hardcoded SBB normalization
|
||||
|
||||
|
||||
def plain_text(filename):
|
||||
return plain_extract(filename).text
|
||||
def plain_text(filename, encoding="autodetect"):
|
||||
return plain_extract(filename, encoding=encoding).text
|
||||
|
||||
|
||||
def extract(filename, *, textequiv_level="region"):
|
||||
def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"):
|
||||
"""Extract the text from the given file.
|
||||
|
||||
Supports PAGE, ALTO and falls back to plain text.
|
||||
|
@ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"):
|
|||
try:
|
||||
tree = ET.parse(filename)
|
||||
except (XMLSyntaxError, UnicodeDecodeError):
|
||||
return plain_extract(filename)
|
||||
return plain_extract(filename, encoding=plain_encoding)
|
||||
try:
|
||||
return page_extract(tree, textequiv_level=textequiv_level)
|
||||
except ValueError:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue