From 323b389ef9c9ba691bf03bdc5de32632d1d9934d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 13 Feb 2025 16:48:50 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20Add=20OCR-D=20parameter=20for=20?= =?UTF-8?q?plain=20text=20encoding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/ocrd-tool.json | 5 +++++ src/dinglehopper/ocrd_cli.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json index f4572c7..d9b3f1f 100644 --- a/src/dinglehopper/ocrd-tool.json +++ b/src/dinglehopper/ocrd-tool.json @@ -29,6 +29,11 @@ "enum": ["region", "line"], "default": "region", "description": "PAGE XML hierarchy level to extract the text from" + }, + "plain_encoding": { + "type": "string", + "default": "autodetect", + "description": "Encoding (e.g. \"utf-8\") of plain text files" } } } diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 4da4960..2023e03 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -36,6 +36,7 @@ class OcrdDinglehopperEvaluate(Processor): metrics = self.parameter["metrics"] textequiv_level = self.parameter["textequiv_level"] + plain_encoding = self.parameter["plain_encoding"] gt_grp, ocr_grp = self.input_file_grp.split(",") input_file_tuples = self.zip_input_files(on_error="abort") @@ -63,6 +64,7 @@ class OcrdDinglehopperEvaluate(Processor): report_prefix, metrics=metrics, textequiv_level=textequiv_level, + plain_encoding=plain_encoding, ) # Add reports to the workspace