1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-10-26 15:04:15 +01:00

🚧 Add option for text encoding to line dir cli

This commit is contained in:
Gerber, Mike 2025-02-04 13:54:28 +01:00
parent ceea82063a
commit 9114be23de
2 changed files with 39 additions and 10 deletions

View file

@ -109,7 +109,14 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
def process( def process(
gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None gt_dir,
ocr_dir,
report_prefix,
*,
metrics=True,
gt_suffix=None,
ocr_suffix=None,
plain_encoding="autodetect",
): ):
cer = None cer = None
@ -125,8 +132,12 @@ def process(
gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir) gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files): for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
gt_text = plain_extract(gt_fn, include_filename_in_id=True) gt_text = plain_extract(
ocr_text = plain_extract(ocr_fn, include_filename_in_id=True) gt_fn, include_filename_in_id=True, encoding=plain_encoding
)
ocr_text = plain_extract(
ocr_fn, include_filename_in_id=True, encoding=plain_encoding
)
gt_words: List[str] = list(words_normalized(gt_text)) gt_words: List[str] = list(words_normalized(gt_text))
ocr_words: List[str] = list(words_normalized(ocr_text)) ocr_words: List[str] = list(words_normalized(ocr_text))
@ -202,7 +213,12 @@ def process(
) )
@click.option("--gt-suffix", help="Suffix of GT line text files") @click.option("--gt-suffix", help="Suffix of GT line text files")
@click.option("--ocr-suffix", help="Suffix of OCR line text files") @click.option("--ocr-suffix", help="Suffix of OCR line text files")
def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): @click.option(
"--plain-encoding",
default="autodetect",
help='Encoding (e.g. "utf-8") of plain text files',
)
def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
""" """
Compare the GT line text directory against the OCR line text directory. Compare the GT line text directory against the OCR line text directory.
@ -227,6 +243,8 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
$REPORT_PREFIX defaults to "report". The reports include the character error $REPORT_PREFIX defaults to "report". The reports include the character error
rate (CER) and the word error rate (WER). rate (CER) and the word error rate (WER).
It is recommended to specify the encoding of the text files, for example with
--plain-encoding utf-8. If this option is not given, we try to auto-detect it.
""" """
initLogging() initLogging()
process( process(
@ -236,6 +254,7 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix):
metrics=metrics, metrics=metrics,
gt_suffix=gt_suffix, gt_suffix=gt_suffix,
ocr_suffix=ocr_suffix, ocr_suffix=ocr_suffix,
plain_encoding=plain_encoding,
) )

View file

@ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional
import chardet import chardet
from lxml import etree as ET from lxml import etree as ET
from lxml.etree import XMLSyntaxError from lxml.etree import XMLSyntaxError
from ocrd_utils import getLogger
from uniseg.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
from .extracted_text import ExtractedText, normalize_sbb from .extracted_text import ExtractedText, normalize_sbb
log = getLogger("processor.OcrdDinglehopperEvaluate")
def alto_namespace(tree: ET._ElementTree) -> Optional[str]: def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
"""Return the ALTO namespace used in the given ElementTree. """Return the ALTO namespace used in the given ElementTree.
@ -149,7 +152,7 @@ def detect_encoding(filename):
return chardet.detect(open(filename, "rb").read(1024))["encoding"] return chardet.detect(open(filename, "rb").read(1024))["encoding"]
def plain_extract(filename, include_filename_in_id=False): def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"):
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}" id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
def make_segment(no, line): def make_segment(no, line):
@ -163,7 +166,14 @@ def plain_extract(filename, include_filename_in_id=False):
clusters, clusters,
) )
if encoding == "autodetect":
fileencoding = detect_encoding(filename) fileencoding = detect_encoding(filename)
log.warn(
f"Autodetected encoding as '{fileencoding}'"
", it is recommended to specify it explicitly with --plain-encoding"
)
else:
fileencoding = encoding
with open(filename, "r", encoding=fileencoding) as f: with open(filename, "r", encoding=fileencoding) as f:
return ExtractedText( return ExtractedText(
None, None,
@ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False):
# XXX hardcoded SBB normalization # XXX hardcoded SBB normalization
def plain_text(filename): def plain_text(filename, encoding="autodetect"):
return plain_extract(filename).text return plain_extract(filename, encoding=encoding).text
def extract(filename, *, textequiv_level="region"): def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"):
"""Extract the text from the given file. """Extract the text from the given file.
Supports PAGE, ALTO and falls back to plain text. Supports PAGE, ALTO and falls back to plain text.
@ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"):
try: try:
tree = ET.parse(filename) tree = ET.parse(filename)
except (XMLSyntaxError, UnicodeDecodeError): except (XMLSyntaxError, UnicodeDecodeError):
return plain_extract(filename) return plain_extract(filename, encoding=plain_encoding)
try: try:
return page_extract(tree, textequiv_level=textequiv_level) return page_extract(tree, textequiv_level=textequiv_level)
except ValueError: except ValueError: