From 0a3f525f0a2c8efbdfe55c5a27c3e8ac526662f9 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 23 Aug 2024 18:19:28 +0200 Subject: [PATCH] port processor to core v3 --- qurator/eynollah/processor.py | 89 +++++++++++------------------------ requirements.txt | 2 +- 2 files changed, 29 insertions(+), 62 deletions(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 1bd190e..c8748af 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -1,68 +1,35 @@ -from json import loads -from pkg_resources import resource_string -from tempfile import NamedTemporaryFile -from pathlib import Path -from os.path import join - -from PIL import Image - +from typing import Optional +from ocrd.processor.ocrd_page_result import OcrdPageResult +from ocrd_models import OcrdPage from ocrd import Processor -from ocrd_modelfactory import page_from_file, exif_from_filename -from ocrd_models import OcrdFile, OcrdExif -from ocrd_models.ocrd_page import to_xml -from ocrd_utils import ( - getLogger, - MIMETYPE_PAGE, - assert_file_grp_cardinality, - make_file_id -) from .eynollah import Eynollah -from .utils.pil_cv2 import pil2cv - -OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) class EynollahProcessor(Processor): - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment'] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, **kwargs) + @property + def metadata_location(self) -> str: + return 'eynollah/ocrd-tool.json' - def process(self): - LOG = getLogger('eynollah') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files)) - pcgts = page_from_file(self.workspace.download_file(input_file)) - LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight) - self.add_metadata(pcgts) - page = pcgts.get_Page() - # XXX loses DPI information - # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename - eynollah_kwargs = { - 'dir_models': self.resolve_resource(self.parameter['models']), - 'allow_enhancement': False, - 'curved_line': self.parameter['curved_line'], - 'full_layout': self.parameter['full_layout'], - 'allow_scaling': self.parameter['allow_scaling'], - 'headers_off': self.parameter['headers_off'], - 'tables': self.parameter['tables'], - 'override_dpi': self.parameter['dpi'], - 'logger': LOG, - 'pcgts': pcgts, - 'image_filename': image_filename - } - Eynollah(**eynollah_kwargs).run() - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=page_id, - mimetype=MIMETYPE_PAGE, - local_filename=join(self.output_file_grp, file_id) + '.xml', - content=to_xml(pcgts)) + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + assert input_pcgts + assert input_pcgts[0] + pcgts = input_pcgts[0] + page = pcgts.get_Page() + # XXX loses DPI information + # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename + Eynollah( + dir_models=self.resolve_resource(self.parameter['models']), + allow_enhancement=False, + curved_line=self.parameter['curved_line'], + full_layout=self.parameter['full_layout'], + allow_scaling=self.parameter['allow_scaling'], + headers_off=self.parameter['headers_off'], + tables=self.parameter['tables'], + override_dpi=self.parameter['dpi'], + logger=self.logger, + pcgts=pcgts, + image_filename=image_filename + ).run() + return OcrdPageResult(pcgts) diff --git a/requirements.txt b/requirements.txt index f01d319..feeea99 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # ocrd includes opencv, numpy, shapely, click -ocrd >= 2.23.3 +ocrd >= 3.0.0a2 numpy <1.24.0 scikit-learn >= 0.23.2 tensorflow == 2.12.1