diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 5cd1bfa..4064de0 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -109,7 +109,14 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): def process( - gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None + gt_dir, + ocr_dir, + report_prefix, + *, + metrics=True, + gt_suffix=None, + ocr_suffix=None, + plain_encoding="autodetect", ): cer = None @@ -125,8 +132,12 @@ def process( gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir) for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files): - gt_text = plain_extract(gt_fn, include_filename_in_id=True) - ocr_text = plain_extract(ocr_fn, include_filename_in_id=True) + gt_text = plain_extract( + gt_fn, include_filename_in_id=True, encoding=plain_encoding + ) + ocr_text = plain_extract( + ocr_fn, include_filename_in_id=True, encoding=plain_encoding + ) gt_words: List[str] = list(words_normalized(gt_text)) ocr_words: List[str] = list(words_normalized(ocr_text)) @@ -202,7 +213,12 @@ def process( ) @click.option("--gt-suffix", help="Suffix of GT line text files") @click.option("--ocr-suffix", help="Suffix of OCR line text files") -def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): +@click.option( + "--plain-encoding", + default="autodetect", + help='Encoding (e.g. "utf-8") of plain text files', +) +def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding): """ Compare the GT line text directory against the OCR line text directory. @@ -227,6 +243,8 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): $REPORT_PREFIX defaults to "report". The reports include the character error rate (CER) and the word error rate (WER). + It is recommended to specify the encoding of the text files, for example with + --plain-encoding utf-8. If this option is not given, we try to auto-detect it. """ initLogging() process( @@ -236,6 +254,7 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix, + plain_encoding=plain_encoding, ) diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py index 1593f44..1eecebb 100644 --- a/src/dinglehopper/ocr_files.py +++ b/src/dinglehopper/ocr_files.py @@ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional import chardet from lxml import etree as ET from lxml.etree import XMLSyntaxError +from ocrd_utils import getLogger from uniseg.graphemecluster import grapheme_clusters from .extracted_text import ExtractedText, normalize_sbb +log = getLogger("processor.OcrdDinglehopperEvaluate") + def alto_namespace(tree: ET._ElementTree) -> Optional[str]: """Return the ALTO namespace used in the given ElementTree. @@ -149,7 +152,7 @@ def detect_encoding(filename): return chardet.detect(open(filename, "rb").read(1024))["encoding"] -def plain_extract(filename, include_filename_in_id=False): +def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"): id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}" def make_segment(no, line): @@ -163,7 +166,14 @@ def plain_extract(filename, include_filename_in_id=False): clusters, ) - fileencoding = detect_encoding(filename) + if encoding == "autodetect": + fileencoding = detect_encoding(filename) + log.warn( + f"Autodetected encoding as '{fileencoding}'" + ", it is recommended to specify it explicitly with --plain-encoding" + ) + else: + fileencoding = encoding with open(filename, "r", encoding=fileencoding) as f: return ExtractedText( None, @@ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False): # XXX hardcoded SBB normalization -def plain_text(filename): - return plain_extract(filename).text +def plain_text(filename, encoding="autodetect"): + return plain_extract(filename, encoding=encoding).text -def extract(filename, *, textequiv_level="region"): +def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"): """Extract the text from the given file. Supports PAGE, ALTO and falls back to plain text. @@ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"): try: tree = ET.parse(filename) except (XMLSyntaxError, UnicodeDecodeError): - return plain_extract(filename) + return plain_extract(filename, encoding=plain_encoding) try: return page_extract(tree, textequiv_level=textequiv_level) except ValueError: