From 9db5b4caf5b6335066e121a231cee1b1298bfbfa Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 13 Feb 2025 16:48:50 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20Add=20OCR-D=20parameter=20for=20?= =?UTF-8?q?plain=20text=20encoding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/ocrd-tool.json | 5 +++++ src/dinglehopper/ocrd_cli.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json index 43795e1..ae7c9bb 100644 --- a/src/dinglehopper/ocrd-tool.json +++ b/src/dinglehopper/ocrd-tool.json @@ -25,6 +25,11 @@ "enum": ["region", "line"], "default": "region", "description": "PAGE XML hierarchy level to extract the text from" + }, + "plain_encoding": { + "type": "string", + "default": "autodetect", + "description": "Encoding (e.g. \"utf-8\") of plain text files" } } } diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index fa4747f..2d7da8e 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -26,6 +26,7 @@ class OcrdDinglehopperEvaluate(Processor): assert self.parameter metrics = self.parameter["metrics"] textequiv_level = self.parameter["textequiv_level"] + plain_encoding = self.parameter["plain_encoding"] # wrong number of inputs: let fail gt_file, ocr_file = input_files @@ -52,6 +53,7 @@ class OcrdDinglehopperEvaluate(Processor): self.output_file_grp, metrics=metrics, textequiv_level=textequiv_level, + plain_encoding=plain_encoding, ) # Add reports to the workspace