diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py
index 5cd1bfa..4064de0 100644
--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@@ -109,7 +109,14 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
 
 
 def process(
-    gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None
+    gt_dir,
+    ocr_dir,
+    report_prefix,
+    *,
+    metrics=True,
+    gt_suffix=None,
+    ocr_suffix=None,
+    plain_encoding="autodetect",
 ):
 
     cer = None
@@ -125,8 +132,12 @@ def process(
         gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
 
     for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
-        gt_text = plain_extract(gt_fn, include_filename_in_id=True)
-        ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
+        gt_text = plain_extract(
+            gt_fn, include_filename_in_id=True, encoding=plain_encoding
+        )
+        ocr_text = plain_extract(
+            ocr_fn, include_filename_in_id=True, encoding=plain_encoding
+        )
         gt_words: List[str] = list(words_normalized(gt_text))
         ocr_words: List[str] = list(words_normalized(ocr_text))
 
@@ -202,7 +213,12 @@ def process(
 )
 @click.option("--gt-suffix", help="Suffix of GT line text files")
 @click.option("--ocr-suffix", help="Suffix of OCR line text files")
-def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding  (e.g. "utf-8") of plain text files',
+)
+def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
     """
     Compare the GT line text directory against the OCR line text directory.
 
@@ -227,6 +243,8 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
     $REPORT_PREFIX defaults to "report". The reports include the character error
     rate (CER) and the word error rate (WER).
 
+    It is recommended to specify the encoding of the text files, for example with
+    --plain-encoding utf-8. If this option is not given, we try to auto-detect it.
     """
     initLogging()
     process(
@@ -236,6 +254,7 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
         metrics=metrics,
         gt_suffix=gt_suffix,
         ocr_suffix=ocr_suffix,
+        plain_encoding=plain_encoding,
     )
 
 
diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py
index 1593f44..1eecebb 100644
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@@ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional
 import chardet
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
+from ocrd_utils import getLogger
 from uniseg.graphemecluster import grapheme_clusters
 
 from .extracted_text import ExtractedText, normalize_sbb
 
+log = getLogger("processor.OcrdDinglehopperEvaluate")
+
 
 def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
     """Return the ALTO namespace used in the given ElementTree.
@@ -149,7 +152,7 @@ def detect_encoding(filename):
     return chardet.detect(open(filename, "rb").read(1024))["encoding"]
 
 
-def plain_extract(filename, include_filename_in_id=False):
+def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"):
     id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
 
     def make_segment(no, line):
@@ -163,7 +166,14 @@ def plain_extract(filename, include_filename_in_id=False):
             clusters,
         )
 
-    fileencoding = detect_encoding(filename)
+    if encoding == "autodetect":
+        fileencoding = detect_encoding(filename)
+        log.warning(
+            f"Autodetected encoding as '{fileencoding}'"
+            ", it is recommended to specify it explicitly with --plain-encoding"
+        )
+    else:
+        fileencoding = encoding
     with open(filename, "r", encoding=fileencoding) as f:
         return ExtractedText(
             None,
@@ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False):
     # XXX hardcoded SBB normalization
 
 
-def plain_text(filename):
-    return plain_extract(filename).text
+def plain_text(filename, encoding="autodetect"):
+    return plain_extract(filename, encoding=encoding).text
 
 
-def extract(filename, *, textequiv_level="region"):
+def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"):
     """Extract the text from the given file.
 
     Supports PAGE, ALTO and falls back to plain text.
@@ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"):
     try:
         tree = ET.parse(filename)
     except (XMLSyntaxError, UnicodeDecodeError):
-        return plain_extract(filename)
+        return plain_extract(filename, encoding=plain_encoding)
     try:
         return page_extract(tree, textequiv_level=textequiv_level)
     except ValueError: