From eca76a7db22ed33ab2d59d5262f5d04d37eb4a01 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 13 Feb 2025 16:39:29 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20Add=20option=20for=20text=20enco?= =?UTF-8?q?ding=20to=20line=20dir=20cli?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py index b67e9cc..5e5e81c 100644 --- a/src/dinglehopper/cli.py +++ b/src/dinglehopper/cli.py @@ -114,6 +114,7 @@ def process( metrics: bool = True, differences: bool = False, textequiv_level: str = "region", + plain_encoding: str = "autodetect", ) -> None: """Check OCR result against GT. @@ -121,8 +122,12 @@ def process( this undecorated version and use Click on a wrapper. """ - gt_text = extract(gt, textequiv_level=textequiv_level) - ocr_text = extract(ocr, textequiv_level=textequiv_level) + gt_text = extract( + gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding + ) + ocr_text = extract( + ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding + ) gt_words: List[str] = list(words_normalized(gt_text)) ocr_words: List[str] = list(words_normalized(ocr_text)) @@ -195,6 +200,7 @@ def process_dir( metrics: bool = True, differences: bool = False, textequiv_level: str = "region", + plain_encoding: str = "autodetect", ) -> None: for gt_file in os.listdir(gt): gt_file_path = os.path.join(gt, gt_file) @@ -209,6 +215,7 @@ def process_dir( metrics=metrics, differences=differences, textequiv_level=textequiv_level, + plain_encoding=plain_encoding, ) else: print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path)) @@ -233,6 +240,11 @@ def process_dir( help="PAGE TextEquiv level to extract text from", metavar="LEVEL", ) +@click.option( + "--plain-encoding", + default="autodetect", + help='Encoding (e.g. "utf-8") of plain text files', +) @click.option("--progress", default=False, is_flag=True, help="Show progress bar") @click.version_option() def main( @@ -243,6 +255,7 @@ def main( metrics, differences, textequiv_level, + plain_encoding, progress, ): """ @@ -280,6 +293,7 @@ def main( metrics=metrics, differences=differences, textequiv_level=textequiv_level, + plain_encoding=plain_encoding, ) else: process( @@ -290,6 +304,7 @@ def main( metrics=metrics, differences=differences, textequiv_level=textequiv_level, + plain_encoding=plain_encoding, )