This commit is contained in:
kba 2026-04-28 12:18:29 +02:00
parent 957dc66e7c
commit a9e12a63da
3 changed files with 129 additions and 0 deletions

View file

@ -42,6 +42,7 @@ classifiers = [
eynollah = "eynollah.cli:main"
eynollah-training = "eynollah.training.cli:main"
ocrd-eynollah-segment = "eynollah.ocrd_cli_segment:main"
ocrd-eynollah-recognize = "eynollah.ocrd_cli_recognize:main"
ocrd-sbb-binarize = "eynollah.ocrd_cli_binarization:main"
[project.urls]

View file

@ -163,5 +163,42 @@
}
]
}
},
"ocrd-eynollah-recognize": {
"executable": "ocrd-eynollah-recognize",
"categories": ["Text recognition and optimization"],
"steps": ["recognition/text-recognition"],
"input_file_grp_cardinality": 1,
"output_file_grp_cardinality": 1,
"parameters": {
"models": {
"type": "string",
"format": "uri",
"content-type": "text/directory",
"cacheable": true,
"description": "Directory containing the eynollah_models directory",
"required": true
},
"do_not_mask_with_textline_contour": {
"type": "boolean",
"description": "If set to true, cropped textline images will not be masked with the textline contour.",
"default": false
},
"tr_ocr": {
"type": "boolean",
"description": "Whether to use (much more resource-intensive) transformer model",
"default": false
}
},
"resources": [
{
"url": "https://zenodo.org/records/17580627/files/models_ocr_v0_6_0.tar.gz?download=1",
"name": "models_ocr_v0_6_0",
"type": "archive",
"size": 6119874002,
"description": "Models for OCR",
"version_range": ">= v0.6.0"
}
]
}
}

View file

@ -0,0 +1,91 @@
from functools import cached_property
from pathlib import Path
from typing import Optional

from ocrd import Processor
from ocrd.workspace import page_from_file
from ocrd_models import OcrdFileType, OcrdPage
from ocrd_utils import (
    MIMETYPE_PAGE,
    make_file_id,
)

from eynollah.eynollah_ocr import Eynollah_ocr
from eynollah.model_zoo.model_zoo import EynollahModelZoo
from eynollah.utils.pil_cv2 import pil2cv
from eynollah.utils.xml import etree_namespace_for_element_tag
class EynollahRecognizeProcessor(Processor):
    """OCR-D processor wrapping Eynollah's text recognition.

    For each page, the page image is retrieved from the workspace, passed to
    either the transformer-based or the CNN-based recognizer of
    ``Eynollah_ocr`` (depending on the ``tr_ocr`` parameter), and the result
    is written as a new PAGE-XML file into the output file group.
    """

    @cached_property
    def executable(self):
        """Return the name of the executable this processor is registered as."""
        return 'ocrd-eynollah-recognize'

    def setup(self):
        """
        Load the model zoo and instantiate the OCR engine.

        Reads ``models``, ``tr_ocr`` and ``do_not_mask_with_textline_contour``
        from the processor parameters and stores the resulting engine on
        ``self.eynollah_ocr``.
        """
        assert self.parameter
        model_zoo = EynollahModelZoo(basedir=self.parameter['models'])
        # ``batch_size`` and ``min_conf_value_of_textline_text`` are forwarded
        # only when actually supplied: indexing them unconditionally raises
        # KeyError whenever the parameter set does not contain them.
        # NOTE(review): assumes Eynollah_ocr has defaults for both keyword
        # arguments -- confirm against its constructor.
        optional_kwargs = {
            key: self.parameter[key]
            for key in ('batch_size', 'min_conf_value_of_textline_text')
            if key in self.parameter
        }
        self.eynollah_ocr = Eynollah_ocr(
            model_zoo=model_zoo,
            tr_ocr=self.parameter['tr_ocr'],
            do_not_mask_with_textline_contour=self.parameter['do_not_mask_with_textline_contour'],
            **optional_kwargs)

    # FIXME: This is just a proof-of-concept, very inefficient and non-conformant
    # TODO: OCR writing should use PAGE API once result dataclass mechanism is settled,
    #       then simplify/port to process_page_pcgts
    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
        """
        Recognize the text of a single page and add the result to the workspace.

        Args:
            *input_files: The input files for one page; only the first one is
                used and it must not be ``None``.

        The output PAGE-XML is written to ``<output_file_grp>/<file_id>.xml``
        via ``Eynollah_ocr.write_ocr`` and then registered with
        ``self.workspace.add_file``.
        """
        assert self.workspace
        page_file = input_files[0]
        assert page_file
        page = page_from_file(page_file)
        assert page
        # NOTE(review): the object returned by page_from_file is handed to
        # image_from_page as-is -- confirm this is the type that API expects.
        page_image, _, _ = self.workspace.image_from_page(
            page, page_file.pageId,
            feature_selector="")
        page_ns = etree_namespace_for_element_tag(page.etree.getroot().tag)
        img = pil2cv(page_image)

        if self.eynollah_ocr.tr_ocr:
            result = self.eynollah_ocr.run_trocr(
                img=img,
                page_tree=page.etree,
                page_ns=page_ns,
                tr_ocr_input_height_and_width=384,
            )
        else:
            # The CNN recognizer additionally receives a binarized page image.
            page_image_bin, _, _ = self.workspace.image_from_page(
                page, page_file.pageId,
                feature_selector="binarized")
            result = self.eynollah_ocr.run_cnn(
                img=img,
                page_tree=page.etree,
                page_ns=page_ns,
                img_bin=pil2cv(page_image_bin),
                image_width=512,
                image_height=32,
            )

        output_file_id = make_file_id(page_file, self.output_file_grp)
        output_filename = Path(self.output_file_grp, output_file_id + '.xml')
        # The output directory is shared by all pages, so it already exists
        # from the second page on; a bare mkdir() would raise FileExistsError.
        output_filename.parent.mkdir(parents=True, exist_ok=True)
        self.eynollah_ocr.write_ocr(
            result=result,
            img=img,
            page_tree=page.etree,
            page_ns=page_ns,
            out_file_ocr=str(output_filename),
            out_image_with_text=None,
        )
        self.workspace.add_file(
            file_id=output_file_id,
            file_grp=self.output_file_grp,
            page_id=page_file.pageId,
            local_filename=output_filename,
            # The file entry needs the PAGE media type, not the XML
            # namespace URI held in page_ns.
            mimetype=MIMETYPE_PAGE,
        )