🚧 Add option for text encoding to line dir CLI

feat/flex-line-dirs
Gerber, Mike 1 month ago
parent ceea82063a
commit 9114be23de

@ -109,7 +109,14 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
def process(
gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None
gt_dir,
ocr_dir,
report_prefix,
*,
metrics=True,
gt_suffix=None,
ocr_suffix=None,
plain_encoding="autodetect",
):
cer = None
@ -125,8 +132,12 @@ def process(
gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
gt_text = plain_extract(gt_fn, include_filename_in_id=True)
ocr_text = plain_extract(ocr_fn, include_filename_in_id=True)
gt_text = plain_extract(
gt_fn, include_filename_in_id=True, encoding=plain_encoding
)
ocr_text = plain_extract(
ocr_fn, include_filename_in_id=True, encoding=plain_encoding
)
gt_words: List[str] = list(words_normalized(gt_text))
ocr_words: List[str] = list(words_normalized(ocr_text))
@ -202,7 +213,12 @@ def process(
)
@click.option("--gt-suffix", help="Suffix of GT line text files")
@click.option("--ocr-suffix", help="Suffix of OCR line text files")
def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
@click.option(
"--plain-encoding",
default="autodetect",
help='Encoding (e.g. "utf-8") of plain text files',
)
def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
"""
Compare the GT line text directory against the OCR line text directory.
@ -227,6 +243,8 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
$REPORT_PREFIX defaults to "report". The reports include the character error
rate (CER) and the word error rate (WER).
It is recommended to specify the encoding of the text files, for example with
--plain-encoding utf-8. If this option is not given, we try to auto-detect it.
"""
initLogging()
process(
@ -236,6 +254,7 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
metrics=metrics,
gt_suffix=gt_suffix,
ocr_suffix=ocr_suffix,
plain_encoding=plain_encoding,
)

@ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional
import chardet
from lxml import etree as ET
from lxml.etree import XMLSyntaxError
from ocrd_utils import getLogger
from uniseg.graphemecluster import grapheme_clusters
from .extracted_text import ExtractedText, normalize_sbb
log = getLogger("processor.OcrdDinglehopperEvaluate")
def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
"""Return the ALTO namespace used in the given ElementTree.
@ -149,7 +152,7 @@ def detect_encoding(filename):
return chardet.detect(open(filename, "rb").read(1024))["encoding"]
def plain_extract(filename, include_filename_in_id=False):
def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"):
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
def make_segment(no, line):
@ -163,7 +166,14 @@ def plain_extract(filename, include_filename_in_id=False):
clusters,
)
fileencoding = detect_encoding(filename)
if encoding == "autodetect":
fileencoding = detect_encoding(filename)
log.warn(
f"Autodetected encoding as '{fileencoding}'"
", it is recommended to specify it explicitly with --plain-encoding"
)
else:
fileencoding = encoding
with open(filename, "r", encoding=fileencoding) as f:
return ExtractedText(
None,
@ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False):
# XXX hardcoded SBB normalization
def plain_text(filename):
return plain_extract(filename).text
def plain_text(filename, encoding="autodetect"):
return plain_extract(filename, encoding=encoding).text
def extract(filename, *, textequiv_level="region"):
def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"):
"""Extract the text from the given file.
Supports PAGE, ALTO and falls back to plain text.
@ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"):
try:
tree = ET.parse(filename)
except (XMLSyntaxError, UnicodeDecodeError):
return plain_extract(filename)
return plain_extract(filename, encoding=plain_encoding)
try:
return page_extract(tree, textequiv_level=textequiv_level)
except ValueError:

Loading…
Cancel
Save