From 14a4bc56d85bd953153bf64bcb95a92413814efb Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 22 Apr 2025 18:24:35 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Add=20--plain-encoding=20option?= =?UTF-8?q?=20to=20dinglehopper-extract?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_extract.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/dinglehopper/cli_extract.py b/src/dinglehopper/cli_extract.py index 9c51d34..5fce032 100644 --- a/src/dinglehopper/cli_extract.py +++ b/src/dinglehopper/cli_extract.py @@ -12,7 +12,12 @@ from .ocr_files import extract help="PAGE TextEquiv level to extract text from", metavar="LEVEL", ) -def main(input_file, textequiv_level): +@click.option( + "--plain-encoding", + default="autodetect", + help='Encoding (e.g. "utf-8") of plain text files', +) +def main(input_file, textequiv_level, plain_encoding): """ Extract the text of the given INPUT_FILE. @@ -23,7 +28,9 @@ def main(input_file, textequiv_level): use "--textequiv-level line" to extract from the level of TextLine tags. """ initLogging() - input_text = extract(input_file, textequiv_level=textequiv_level).text + input_text = extract( + input_file, textequiv_level=textequiv_level, plain_encoding=plain_encoding + ).text print(input_text)