From a9e12a63da3e911ad1c4545b901ebbb661b0e5f6 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 28 Apr 2026 12:18:29 +0200 Subject: [PATCH] wp --- pyproject.toml | 1 + src/eynollah/ocrd-tool.json | 37 ++++++++++++ src/eynollah/ocrd_cli_recognize.py | 91 ++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+) create mode 100644 src/eynollah/ocrd_cli_recognize.py diff --git a/pyproject.toml b/pyproject.toml index ca773a7..e15911f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ classifiers = [ eynollah = "eynollah.cli:main" eynollah-training = "eynollah.training.cli:main" ocrd-eynollah-segment = "eynollah.ocrd_cli_segment:main" +ocrd-eynollah-recognize = "eynollah.ocrd_cli_recognize:main" ocrd-sbb-binarize = "eynollah.ocrd_cli_binarization:main" [project.urls] diff --git a/src/eynollah/ocrd-tool.json b/src/eynollah/ocrd-tool.json index fc61af7..c946541 100644 --- a/src/eynollah/ocrd-tool.json +++ b/src/eynollah/ocrd-tool.json @@ -163,5 +163,42 @@ } ] } + }, + "ocrd-eynollah-recognize": { + "executable": "ocrd-eynollah-recognize", + "categories": ["Text recognition and optimization"], + "steps": ["recognition/text-recognition"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "parameters": { + "models": { + "type": "string", + "format": "uri", + "content-type": "text/directory", + "cacheable": true, + "description": "Directory containing the eynollah_models directory", + "required": true + }, + "do_not_mask_with_textline_contour": { + "type": "boolean", + "description": "if this parameter is set to true, cropped textline images will not be masked with textline contour.", + "default": false + }, + "tr_ocr": { + "type": "boolean", + "description": "Whether to use (much more resource-intensive) transformer model", + "default": false + } + }, + "resources": [ + { + "url": "https://zenodo.org/records/17580627/files/models_ocr_v0_6_0.tar.gz?download=1", + "name": "models_ocr_v0_6_0", + "type": "archive", + "size": 6119874002, + 
"description": "Models for OCR", + "version_range": ">= v0.6.0" + } + ] } } diff --git a/src/eynollah/ocrd_cli_recognize.py b/src/eynollah/ocrd_cli_recognize.py new file mode 100644 index 0000000..63d9b8c --- /dev/null +++ b/src/eynollah/ocrd_cli_recognize.py @@ -0,0 +1,91 @@ +from functools import cached_property +from pathlib import Path +from typing import Optional +from ocrd.workspace import page_from_file +from ocrd_models import OcrdFileType, OcrdPage + +from ocrd import Processor +from ocrd_utils import ( + make_file_id, +) + +from eynollah.eynollah_ocr import Eynollah_ocr +from eynollah.model_zoo.model_zoo import EynollahModelZoo +from eynollah.utils.pil_cv2 import pil2cv +from eynollah.utils.xml import etree_namespace_for_element_tag + + +class EynollahRecognizeProcessor(Processor): + + @cached_property + def executable(self): + return 'ocrd-eynollah-recognize' + + def setup(self): + """ + Load model, set predict function + """ + assert self.parameter + model_zoo = EynollahModelZoo(basedir=self.parameter['models']) + assert self.parameter + self.eynollah_ocr = Eynollah_ocr( + model_zoo=model_zoo, + tr_ocr=self.parameter['tr_ocr'], + do_not_mask_with_textline_contour=self.parameter['do_not_mask_with_textline_contour'], + batch_size=self.parameter['batch_size'], + min_conf_value_of_textline_text=self.parameter['min_conf_value_of_textline_text']) + + # FIXME: This is just a proof-of-concept, very inefficient and non-conformant + # TODO: OCR writing should use PAGE API once result dataclass mechanism is settled, + # then simplify/port to proces_page_pcgts + def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: + assert self.workspace + page_file = input_files[0] + assert page_file + page = page_from_file(page_file) + assert page + page_image, page_coords, _ = self.workspace.image_from_page( + page, page_file.pageId, + feature_selector="") + page_ns = etree_namespace_for_element_tag(page.etree.getroot().tag) + + img = pil2cv(page_image) + 
if self.eynollah_ocr.tr_ocr: + result = self.eynollah_ocr.run_trocr( + img=img, + page_tree=page.etree, + page_ns=page_ns, + + tr_ocr_input_height_and_width = 384 + ) + else: + page_image_bin, _, _ = self.workspace.image_from_page( + page, page_file.pageId, + feature_selector="binarized") + result = self.eynollah_ocr.run_cnn( + img=img, + page_tree=page.etree, + page_ns=page_ns, + + img_bin=pil2cv(page_image_bin), + image_width=512, + image_height=32, + ) + output_file_id = make_file_id(page_file, self.output_file_grp) + output_filename = Path(self.output_file_grp, output_file_id + '.xml') + output_filename.parent.mkdir() + self.eynollah_ocr.write_ocr( + result=result, + img=img, + page_tree=page.etree, + page_ns=page_ns, + out_file_ocr=str(output_filename), + out_image_with_text=None, + ) + self.workspace.add_file( + file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=page_file.pageId, + local_filename=output_filename, + mimetype=page_ns, + )