🚧 Add option for text encoding to line dir cli

2025-07-12 03:49:57 +02:00 · 2025-02-04 13:54:28 +01:00 · 2025-02-04 13:54:28 +01:00 · 9114be23de
commit 9114be23de
parent ceea82063a
2 changed files with 39 additions and 10 deletions
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -109,7 +109,14 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):


 def process(
-    gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None
+    gt_dir,
+    ocr_dir,
+    report_prefix,
+    *,
+    metrics=True,
+    gt_suffix=None,
+    ocr_suffix=None,
+    plain_encoding="autodetect",
 ):

    cer = None
@@ -125,8 +132,12 @@ def process(
        gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)

    for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
-        gt_text = plain_extract(gt_fn, include_filename_in_id=True)
-        ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
+        gt_text = plain_extract(
+            gt_fn, include_filename_in_id=True, encoding=plain_encoding
+        )
+        ocr_text = plain_extract(
+            ocr_fn, include_filename_in_id=True, encoding=plain_encoding
+        )
        gt_words: List[str] = list(words_normalized(gt_text))
        ocr_words: List[str] = list(words_normalized(ocr_text))

@@ -202,7 +213,12 @@ def process(
 )
 @click.option("--gt-suffix", help="Suffix of GT line text files")
 @click.option("--ocr-suffix", help="Suffix of OCR line text files")
-def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding  (e.g. "utf-8") of plain text files',
+)
+def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
    """
    Compare the GT line text directory against the OCR line text directory.

@@ -227,6 +243,8 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).

+    It is recommended to specify the encoding of the text files, for example with
+    --plain-encoding utf-8. If this option is not given, we try to auto-detect it.
    """
    initLogging()
    process(
@@ -236,6 +254,7 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
        metrics=metrics,
        gt_suffix=gt_suffix,
        ocr_suffix=ocr_suffix,
+        plain_encoding=plain_encoding,
    )


--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@@ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional
 import chardet
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
+from ocrd_utils import getLogger
 from uniseg.graphemecluster import grapheme_clusters

 from .extracted_text import ExtractedText, normalize_sbb

+log = getLogger("processor.OcrdDinglehopperEvaluate")
+

 def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
    """Return the ALTO namespace used in the given ElementTree.
@@ -149,7 +152,7 @@ def detect_encoding(filename):
    return chardet.detect(open(filename, "rb").read(1024))["encoding"]


-def plain_extract(filename, include_filename_in_id=False):
+def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"):
    id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"

    def make_segment(no, line):
@@ -163,7 +166,14 @@ def plain_extract(filename, include_filename_in_id=False):
            clusters,
        )

-    fileencoding = detect_encoding(filename)
+    if encoding == "autodetect":
+        fileencoding = detect_encoding(filename)
+        log.warn(
+            f"Autodetected encoding as '{fileencoding}'"
+            ", it is recommended to specify it explicitly with --plain-encoding"
+        )
+    else:
+        fileencoding = encoding
    with open(filename, "r", encoding=fileencoding) as f:
        return ExtractedText(
            None,
@@ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False):
    # XXX hardcoded SBB normalization


-def plain_text(filename):
-    return plain_extract(filename).text
+def plain_text(filename, encoding="autodetect"):
+    return plain_extract(filename, encoding=encoding).text


-def extract(filename, *, textequiv_level="region"):
+def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"):
    """Extract the text from the given file.

    Supports PAGE, ALTO and falls back to plain text.
@@ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"):
    try:
        tree = ET.parse(filename)
    except (XMLSyntaxError, UnicodeDecodeError):
-        return plain_extract(filename)
+        return plain_extract(filename, encoding=plain_encoding)
    try:
        return page_extract(tree, textequiv_level=textequiv_level)
    except ValueError: