port processor to core v3
parent
78bfa97c06
commit
0a3f525f0a
@ -1,68 +1,35 @@
|
|||||||
from json import loads
|
from typing import Optional
|
||||||
from pkg_resources import resource_string
|
from ocrd.processor.ocrd_page_result import OcrdPageResult
|
||||||
from tempfile import NamedTemporaryFile
|
from ocrd_models import OcrdPage
|
||||||
from pathlib import Path
|
|
||||||
from os.path import join
|
|
||||||
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
from ocrd import Processor
|
from ocrd import Processor
|
||||||
from ocrd_modelfactory import page_from_file, exif_from_filename
|
|
||||||
from ocrd_models import OcrdFile, OcrdExif
|
|
||||||
from ocrd_models.ocrd_page import to_xml
|
|
||||||
from ocrd_utils import (
|
|
||||||
getLogger,
|
|
||||||
MIMETYPE_PAGE,
|
|
||||||
assert_file_grp_cardinality,
|
|
||||||
make_file_id
|
|
||||||
)
|
|
||||||
|
|
||||||
from .eynollah import Eynollah
|
from .eynollah import Eynollah
|
||||||
from .utils.pil_cv2 import pil2cv
|
|
||||||
|
|
||||||
OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
|
|
||||||
|
|
||||||
class EynollahProcessor(Processor):
|
class EynollahProcessor(Processor):
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
@property
|
||||||
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment']
|
def metadata_location(self) -> str:
|
||||||
kwargs['version'] = OCRD_TOOL['version']
|
return 'eynollah/ocrd-tool.json'
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
|
|
||||||
def process(self):
|
def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
|
||||||
LOG = getLogger('eynollah')
|
assert input_pcgts
|
||||||
assert_file_grp_cardinality(self.input_file_grp, 1)
|
assert input_pcgts[0]
|
||||||
assert_file_grp_cardinality(self.output_file_grp, 1)
|
pcgts = input_pcgts[0]
|
||||||
for n, input_file in enumerate(self.input_files):
|
|
||||||
page_id = input_file.pageId or input_file.ID
|
|
||||||
LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files))
|
|
||||||
pcgts = page_from_file(self.workspace.download_file(input_file))
|
|
||||||
LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight)
|
|
||||||
self.add_metadata(pcgts)
|
|
||||||
page = pcgts.get_Page()
|
page = pcgts.get_Page()
|
||||||
# XXX loses DPI information
|
# XXX loses DPI information
|
||||||
# page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
|
# page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
|
||||||
image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename
|
image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename
|
||||||
eynollah_kwargs = {
|
Eynollah(
|
||||||
'dir_models': self.resolve_resource(self.parameter['models']),
|
dir_models=self.resolve_resource(self.parameter['models']),
|
||||||
'allow_enhancement': False,
|
allow_enhancement=False,
|
||||||
'curved_line': self.parameter['curved_line'],
|
curved_line=self.parameter['curved_line'],
|
||||||
'full_layout': self.parameter['full_layout'],
|
full_layout=self.parameter['full_layout'],
|
||||||
'allow_scaling': self.parameter['allow_scaling'],
|
allow_scaling=self.parameter['allow_scaling'],
|
||||||
'headers_off': self.parameter['headers_off'],
|
headers_off=self.parameter['headers_off'],
|
||||||
'tables': self.parameter['tables'],
|
tables=self.parameter['tables'],
|
||||||
'override_dpi': self.parameter['dpi'],
|
override_dpi=self.parameter['dpi'],
|
||||||
'logger': LOG,
|
logger=self.logger,
|
||||||
'pcgts': pcgts,
|
pcgts=pcgts,
|
||||||
'image_filename': image_filename
|
image_filename=image_filename
|
||||||
}
|
).run()
|
||||||
Eynollah(**eynollah_kwargs).run()
|
return OcrdPageResult(pcgts)
|
||||||
file_id = make_file_id(input_file, self.output_file_grp)
|
|
||||||
pcgts.set_pcGtsId(file_id)
|
|
||||||
self.workspace.add_file(
|
|
||||||
ID=file_id,
|
|
||||||
file_grp=self.output_file_grp,
|
|
||||||
pageId=page_id,
|
|
||||||
mimetype=MIMETYPE_PAGE,
|
|
||||||
local_filename=join(self.output_file_grp, file_id) + '.xml',
|
|
||||||
content=to_xml(pcgts))
|
|
||||||
|
Loading…
Reference in New Issue