mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-06-09 20:29:55 +02:00
processor: reuse loaded models across pages, use derived images
Conflicts: qurator/eynollah/processor.py
This commit is contained in:
parent
c37d95dedf
commit
61bcb435ae
1 changed file with 22 additions and 9 deletions
|
@ -12,6 +12,8 @@ class EynollahProcessor(Processor):
|
||||||
return 'eynollah/ocrd-tool.json'
|
return 'eynollah/ocrd-tool.json'
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
|
# for caching models
|
||||||
|
self.models = None
|
||||||
if self.parameter['textline_light'] and not self.parameter['light_mode']:
|
if self.parameter['textline_light'] and not self.parameter['light_mode']:
|
||||||
raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection but parameter 'light_mode' is not enabled")
|
raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection but parameter 'light_mode' is not enabled")
|
||||||
|
|
||||||
|
@ -21,14 +23,19 @@ class EynollahProcessor(Processor):
|
||||||
assert self.parameter
|
assert self.parameter
|
||||||
pcgts = input_pcgts[0]
|
pcgts = input_pcgts[0]
|
||||||
page = pcgts.get_Page()
|
page = pcgts.get_Page()
|
||||||
|
# if not('://' in page.imageFilename):
|
||||||
|
# image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename
|
||||||
|
# else:
|
||||||
|
# # could be a URL with file:// or truly remote
|
||||||
|
# image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename
|
||||||
# XXX loses DPI information
|
# XXX loses DPI information
|
||||||
# page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
|
page_image, _, _ = self.workspace.image_from_page(
|
||||||
if not('://' in page.imageFilename):
|
page, page_id,
|
||||||
image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename
|
# avoid any features that would change the coordinate system: cropped,deskewed
|
||||||
else:
|
# (the PAGE builder merely adds regions, so afterwards we would not know which to transform)
|
||||||
# could be a URL with file:// or truly remote
|
# also avoid binarization as models usually fare better on grayscale/RGB
|
||||||
image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename
|
feature_filter='cropped,deskewed,binarized')
|
||||||
Eynollah(
|
eynollah = Eynollah(
|
||||||
self.resolve_resource(self.parameter['models']),
|
self.resolve_resource(self.parameter['models']),
|
||||||
self.logger,
|
self.logger,
|
||||||
allow_enhancement=self.parameter['allow_enhancement'],
|
allow_enhancement=self.parameter['allow_enhancement'],
|
||||||
|
@ -43,6 +50,12 @@ class EynollahProcessor(Processor):
|
||||||
tables=self.parameter['tables'],
|
tables=self.parameter['tables'],
|
||||||
override_dpi=self.parameter['dpi'],
|
override_dpi=self.parameter['dpi'],
|
||||||
pcgts=pcgts,
|
pcgts=pcgts,
|
||||||
image_filename=str(image_filename)
|
image_filename=page.imageFilename,
|
||||||
).run()
|
image_pil=page_image
|
||||||
|
)
|
||||||
|
if self.models is not None:
|
||||||
|
# reuse loaded models from previous page
|
||||||
|
eynollah.models = self.models
|
||||||
|
eynollah.run()
|
||||||
|
self.models = eynollah.models
|
||||||
return OcrdPageResult(pcgts)
|
return OcrdPageResult(pcgts)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue