🚧 Add option for text encoding to line dir cli

2026-03-03 13:52:10 +01:00 · 2025-02-04 13:54:28 +01:00 · 2025-02-04 13:54:28 +01:00 · 9114be23de
commit 9114be23de
parent ceea82063a
2 changed files with 39 additions and 10 deletions
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -109,7 +109,14 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
 def process(
-    gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None
+    gt_dir,
    ocr_dir,
    report_prefix,
    *,
    metrics=True,
    gt_suffix=None,
    ocr_suffix=None,
    plain_encoding="autodetect",
 ):
    cer = None
@@ -125,8 +132,12 @@ def process(
        gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
    for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
-        gt_text = plain_extract(gt_fn, include_filename_in_id=True)
+        gt_text = plain_extract(
-        ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
+            gt_fn, include_filename_in_id=True, encoding=plain_encoding
        )
        ocr_text = plain_extract(
            ocr_fn, include_filename_in_id=True, encoding=plain_encoding
        )
        gt_words: List[str] = list(words_normalized(gt_text))
        ocr_words: List[str] = list(words_normalized(ocr_text))
@@ -202,7 +213,12 @@ def process(
 )
@click.option("--gt-suffix", help="Suffix of GT line text files")
@click.option("--ocr-suffix", help="Suffix of OCR line text files")
-def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
+@click.option(
    "--plain-encoding",
    default="autodetect",
    help='Encoding  (e.g. "utf-8") of plain text files',
 )
 def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
    """
    Compare the GT line text directory against the OCR line text directory.
@@ -227,6 +243,8 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).
    It is recommended to specify the encoding of the text files, for example with
    --plain-encoding utf-8. If this option is not given, we try to auto-detect it.
    """
    initLogging()
    process(
@@ -236,6 +254,7 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
        metrics=metrics,
        gt_suffix=gt_suffix,
        ocr_suffix=ocr_suffix,
        plain_encoding=plain_encoding,
    )
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@@ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional
 import chardet
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
 from ocrd_utils import getLogger
 from uniseg.graphemecluster import grapheme_clusters
 from .extracted_text import ExtractedText, normalize_sbb
 log = getLogger("processor.OcrdDinglehopperEvaluate")
 def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
    """Return the ALTO namespace used in the given ElementTree.
@@ -149,7 +152,7 @@ def detect_encoding(filename):
    return chardet.detect(open(filename, "rb").read(1024))["encoding"]
-def plain_extract(filename, include_filename_in_id=False):
+def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"):
    id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
    def make_segment(no, line):
@@ -163,7 +166,14 @@ def plain_extract(filename, include_filename_in_id=False):
            clusters,
        )
-    fileencoding = detect_encoding(filename)
+    if encoding == "autodetect":
        fileencoding = detect_encoding(filename)
        log.warn(
            f"Autodetected encoding as '{fileencoding}'"
            ", it is recommended to specify it explicitly with --plain-encoding"
        )
    else:
        fileencoding = encoding
    with open(filename, "r", encoding=fileencoding) as f:
        return ExtractedText(
            None,
@@ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False):
    # XXX hardcoded SBB normalization
-def plain_text(filename):
+def plain_text(filename, encoding="autodetect"):
-    return plain_extract(filename).text
+    return plain_extract(filename, encoding=encoding).text
-def extract(filename, *, textequiv_level="region"):
+def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"):
    """Extract the text from the given file.
    Supports PAGE, ALTO and falls back to plain text.
@@ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"):
    try:
        tree = ET.parse(filename)
    except (XMLSyntaxError, UnicodeDecodeError):
-        return plain_extract(filename)
+        return plain_extract(filename, encoding=plain_encoding)
    try:
        return page_extract(tree, textequiv_level=textequiv_level)
    except ValueError: