diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 92a91c2..ea144e4 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -12,6 +12,8 @@ class EynollahProcessor(Processor): return 'eynollah/ocrd-tool.json' def setup(self) -> None: + # for caching models + self.models = None if self.parameter['textline_light'] and not self.parameter['light_mode']: raise ValueError("Error: You set parameter 'textline_light' to enable light textline detection but parameter 'light_mode' is not enabled") @@ -21,14 +23,19 @@ class EynollahProcessor(Processor): assert self.parameter pcgts = input_pcgts[0] page = pcgts.get_Page() + # if not('://' in page.imageFilename): + # image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename + # else: + # # could be a URL with file:// or truly remote + # image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename # XXX loses DPI information - # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - if not('://' in page.imageFilename): - image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename - else: - # could be a URL with file:// or truly remote - image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename - Eynollah( + page_image, _, _ = self.workspace.image_from_page( + page, page_id, + # avoid any features that would change the coordinate system: cropped,deskewed + # (the PAGE builder merely adds regions, so afterwards we would not know which to transform) + # also avoid binarization as models usually fare better on grayscale/RGB + feature_filter='cropped,deskewed,binarized') + eynollah = Eynollah( self.resolve_resource(self.parameter['models']), self.logger, allow_enhancement=self.parameter['allow_enhancement'], @@ -43,6 +50,12 @@ class EynollahProcessor(Processor): tables=self.parameter['tables'], override_dpi=self.parameter['dpi'], pcgts=pcgts, - image_filename=str(image_filename) - ).run() + image_filename=page.imageFilename, + image_pil=page_image + ) + if self.models is not None: + # reuse loaded models from previous page + eynollah.models = self.models + eynollah.run() + self.models = eynollah.models return OcrdPageResult(pcgts)