port processor to core v3

refactoring-2024-08-merged
kba 4 months ago
parent 78bfa97c06
commit 0a3f525f0a

@@ -1,68 +1,35 @@
from json import loads from typing import Optional
from pkg_resources import resource_string from ocrd.processor.ocrd_page_result import OcrdPageResult
from tempfile import NamedTemporaryFile from ocrd_models import OcrdPage
from pathlib import Path
from os.path import join
from PIL import Image
from ocrd import Processor from ocrd import Processor
from ocrd_modelfactory import page_from_file, exif_from_filename
from ocrd_models import OcrdFile, OcrdExif
from ocrd_models.ocrd_page import to_xml
from ocrd_utils import (
getLogger,
MIMETYPE_PAGE,
assert_file_grp_cardinality,
make_file_id
)
from .eynollah import Eynollah from .eynollah import Eynollah
from .utils.pil_cv2 import pil2cv
# Pre-change revision only (this row has no right-hand counterpart): read the
# 'ocrd-tool.json' resource that sits next to this module, decode it as UTF-8
# and parse it as JSON. The pre-change __init__ indexes the result with
# ['tools']['ocrd-eynollah-segment'] and ['version'].
OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
# NOTE(review): this is a side-by-side diff rendering of EynollahProcessor, not
# runnable Python. Each row shows the removed (pre-change) text first, followed
# on the same line by the added (post-change) text; rows holding one statement
# only (e.g. the super().__init__ call and everything from 'override_dpi' down
# to the workspace.add_file call) exist in the pre-change revision alone.
# Indentation was lost in the rendering.
#
# Pre-change revision (left-hand text):
#   __init__  copies OCRD_TOOL['tools']['ocrd-eynollah-segment'] and
#             OCRD_TOOL['version'] into kwargs as 'ocrd_tool' / 'version', then
#             delegates to Processor.__init__.
#   process   calls assert_file_grp_cardinality(..., 1) on the input and output
#             file groups, then for each entry of self.input_files: parses the
#             PAGE document, calls self.add_metadata, downloads the file whose
#             local_filename equals page.imageFilename, runs
#             Eynollah(**eynollah_kwargs).run() with a logger obtained from
#             getLogger('eynollah'), and adds the serialized PAGE
#             (to_xml(pcgts)) to the output file group under an ID from
#             make_file_id.
#
# Post-change revision (right-hand text):
#   metadata_location   property returning 'eynollah/ocrd-tool.json'.
#   process_page_pcgts  asserts that input_pcgts is non-empty and that its
#                       first element is truthy, downloads the image file the
#                       same way, runs Eynollah(...).run() with the same
#                       parameter keys (the logger is now self.logger) and
#                       returns OcrdPageResult(pcgts). No per-file loop and no
#                       workspace.add_file call appear in this revision.
#
# Both revisions keep the "XXX loses DPI information" remark and the
# commented-out image_from_page call.
class EynollahProcessor(Processor): class EynollahProcessor(Processor):
def __init__(self, *args, **kwargs): @property
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment'] def metadata_location(self) -> str:
kwargs['version'] = OCRD_TOOL['version'] return 'eynollah/ocrd-tool.json'
super().__init__(*args, **kwargs)
def process(self): def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
LOG = getLogger('eynollah') assert input_pcgts
assert_file_grp_cardinality(self.input_file_grp, 1) assert input_pcgts[0]
assert_file_grp_cardinality(self.output_file_grp, 1) pcgts = input_pcgts[0]
for n, input_file in enumerate(self.input_files): page = pcgts.get_Page()
page_id = input_file.pageId or input_file.ID # XXX loses DPI information
LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files)) # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
pcgts = page_from_file(self.workspace.download_file(input_file)) image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename
LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight) Eynollah(
self.add_metadata(pcgts) dir_models=self.resolve_resource(self.parameter['models']),
page = pcgts.get_Page() allow_enhancement=False,
# XXX loses DPI information curved_line=self.parameter['curved_line'],
# page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') full_layout=self.parameter['full_layout'],
image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename allow_scaling=self.parameter['allow_scaling'],
eynollah_kwargs = { headers_off=self.parameter['headers_off'],
'dir_models': self.resolve_resource(self.parameter['models']), tables=self.parameter['tables'],
'allow_enhancement': False, override_dpi=self.parameter['dpi'],
'curved_line': self.parameter['curved_line'], logger=self.logger,
'full_layout': self.parameter['full_layout'], pcgts=pcgts,
'allow_scaling': self.parameter['allow_scaling'], image_filename=image_filename
'headers_off': self.parameter['headers_off'], ).run()
'tables': self.parameter['tables'], return OcrdPageResult(pcgts)
'override_dpi': self.parameter['dpi'],
'logger': LOG,
'pcgts': pcgts,
'image_filename': image_filename
}
Eynollah(**eynollah_kwargs).run()
file_id = make_file_id(input_file, self.output_file_grp)
pcgts.set_pcGtsId(file_id)
self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=page_id,
mimetype=MIMETYPE_PAGE,
local_filename=join(self.output_file_grp, file_id) + '.xml',
content=to_xml(pcgts))

@@ -1,5 +1,5 @@
# ocrd includes opencv, numpy, shapely, click # ocrd includes opencv, numpy, shapely, click
ocrd >= 2.23.3 ocrd >= 3.0.0a2
numpy <1.24.0 numpy <1.24.0
scikit-learn >= 0.23.2 scikit-learn >= 0.23.2
tensorflow == 2.12.1 tensorflow == 2.12.1

Loading…
Cancel
Save