processor: reuse loaded models across pages, use derived images

# Conflicts:
#	qurator/eynollah/processor.py
pull/134/head
Authored by Robert Sachunsky 2 years ago; committed by kba
parent c37d95dedf
commit 61bcb435ae

@ -12,6 +12,8 @@ class EynollahProcessor(Processor):
return 'eynollah/ocrd-tool.json' return 'eynollah/ocrd-tool.json'
def setup(self) -> None: def setup(self) -> None:
# for caching models
self.models = None
if self.parameter['textline_light'] and not self.parameter['light_mode']: if self.parameter['textline_light'] and not self.parameter['light_mode']:
raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection but parameter 'light_mode' is not enabled") raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection but parameter 'light_mode' is not enabled")
@ -21,14 +23,19 @@ class EynollahProcessor(Processor):
assert self.parameter assert self.parameter
pcgts = input_pcgts[0] pcgts = input_pcgts[0]
page = pcgts.get_Page() page = pcgts.get_Page()
# if not('://' in page.imageFilename):
# image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename
# else:
# # could be a URL with file:// or truly remote
# image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename
# XXX loses DPI information # XXX loses DPI information
# page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') page_image, _, _ = self.workspace.image_from_page(
if not('://' in page.imageFilename): page, page_id,
image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename # avoid any features that would change the coordinate system: cropped,deskewed
else: # (the PAGE builder merely adds regions, so afterwards we would not know which to transform)
# could be a URL with file:// or truly remote # also avoid binarization as models usually fare better on grayscale/RGB
image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename feature_filter='cropped,deskewed,binarized')
Eynollah( eynollah = Eynollah(
self.resolve_resource(self.parameter['models']), self.resolve_resource(self.parameter['models']),
self.logger, self.logger,
allow_enhancement=self.parameter['allow_enhancement'], allow_enhancement=self.parameter['allow_enhancement'],
@ -43,6 +50,12 @@ class EynollahProcessor(Processor):
tables=self.parameter['tables'], tables=self.parameter['tables'],
override_dpi=self.parameter['dpi'], override_dpi=self.parameter['dpi'],
pcgts=pcgts, pcgts=pcgts,
image_filename=str(image_filename) image_filename=page.imageFilename,
).run() image_pil=page_image
)
if self.models is not None:
# reuse loaded models from previous page
eynollah.models = self.models
eynollah.run()
self.models = eynollah.models
return OcrdPageResult(pcgts) return OcrdPageResult(pcgts)

Loading…
Cancel
Save