diff --git a/ocrd-tool.json b/ocrd-tool.json new file mode 120000 index 0000000..5c48493 --- /dev/null +++ b/ocrd-tool.json @@ -0,0 +1 @@ +qurator/eynollah/ocrd-tool.json \ No newline at end of file diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index 07812b2..d59fe89 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -80,9 +80,9 @@ class Eynollah: def __init__( self, image_filename, - image_filename_stem, - dir_out, dir_models, + image_filename_stem=None, + dir_out=None, dir_of_cropped_images=None, dir_of_layout=None, dir_of_deskewed=None, @@ -94,6 +94,7 @@ class Eynollah: allow_scaling=False, headers_off=False, override_dpi=None, + logger=None, ): self.image_filename = image_filename self.dir_out = dir_out @@ -117,7 +118,7 @@ class Eynollah: dir_out=self.dir_out, image_filename=self.image_filename, curved_line=self.curved_line) - self.logger = getLogger('eynollah') + self.logger = logger if logger else getLogger('eynollah') self.dir_models = dir_models self.model_dir_of_enhancement = dir_models + "/model_enhancement.h5" diff --git a/qurator/eynollah/ocrd-tool.json b/qurator/eynollah/ocrd-tool.json new file mode 100644 index 0000000..12f067b --- /dev/null +++ b/qurator/eynollah/ocrd-tool.json @@ -0,0 +1,54 @@ +{ + "version": "0.0.1", + "git_url": "https://github.com/qurator-spk/eynollah", + "tools": { + "ocrd-eynollah-segment": { + "executable": "ocrd-eynollah-segment", + "categories": ["Layout analysis"], + "description": "Segment page into regions and lines and do reading order detection with eynollah", + "input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"], + "output_file_grp": ["OCR-D-SEG-LINE"], + "steps": ["layout/segmentation/region", "layout/segmentation/line"], + "parameters": { + "models": { + "type": "string", + "format": "file", + "cacheable": true, + "description": "Path to directory containing models to be used (See https://qurator-data.de/eynollah)", + "required": true + }, + 
"dpi": { + "type": "number", + "format": "float", + "description": "pixel density in dots per inch (overrides any meta-data in the images); disabled when <= 0", + "default": -1 + }, + "full_layout": { + "type": "boolean", + "default": true, + "description": "Try to detect all elements, including drop-caps and marginalia" + }, + "curved_line": { + "type": "boolean", + "default": false, + "description": "If this parameter is set to true, this tool will try to return the contours of textlines instead of rectangular bounding boxes of textlines. Note that with this option the tool needs more time to process." + }, + "allow_enhancement": { + "type": "boolean", + "default": true, + "description": "If this parameter is set to true, this tool will check whether the input image needs resizing and enhancement. If so, the resized and enhanced image and the corresponding layout data will be written to the output directory" + }, + "allow_scaling": { + "type": "boolean", + "default": false, + "description": "If this parameter is set to true, this tool will check the scale and, if needed, scale the image to perform better layout detection" + }, + "headers_off": { + "type": "boolean", + "default": false, + "description": "If this parameter is set to true, this tool will ignore the role of headers in reading order" + } + } + } + } +} diff --git a/qurator/eynollah/ocrd_cli.py b/qurator/eynollah/ocrd_cli.py new file mode 100644 index 0000000..8929927 --- /dev/null +++ b/qurator/eynollah/ocrd_cli.py @@ -0,0 +1,11 @@ +from .processor import EynollahProcessor +from click import command +from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor + +@command() +@ocrd_cli_options +def main(*args, **kwargs): + return ocrd_cli_wrap_processor(EynollahProcessor, *args, **kwargs) + +if __name__ == '__main__': + main() diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py new file mode 100644 index 0000000..07d7ab2 --- /dev/null +++ b/qurator/eynollah/processor.py @@ 
-0,0 +1,59 @@ +from json import loads +from pkg_resources import resource_string +from tempfile import NamedTemporaryFile +from os.path import join + +from ocrd import Processor +from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_page import to_xml +from ocrd_utils import ( + getLogger, + MIMETYPE_PAGE, + assert_file_grp_cardinality, + make_file_id +) + +from .eynollah import Eynollah + +OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) + +class EynollahProcessor(Processor): + """OCR-D processor for the 'ocrd-eynollah-segment' tool: process() saves each input page image to a temporary .tif, runs Eynollah on it and adds the result as PAGE-XML to the output file group.""" + def __init__(self, *args, **kwargs): + kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment'] + kwargs['version'] = OCRD_TOOL['version'] + super().__init__(*args, **kwargs) + + def process(self): + LOG = getLogger('eynollah') + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + for n, input_file in enumerate(self.input_files): + page_id = input_file.pageId or input_file.ID + LOG.info("INPUT FILE %s / %s ", page_id, len(self.input_files)) + pcgts_in = page_from_file(self.workspace.download_file(input_file)) + self.add_metadata(pcgts_in) + page = pcgts_in.get_Page() + page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + file_id = make_file_id(input_file, self.output_file_grp) + with NamedTemporaryFile(buffering=0, suffix='.tif') as f: + page_image.save(f.name) + eynollah_kwargs = { + 'dir_models': self.resolve_resource(self.parameter['models']), + 'allow_enhancement': self.parameter['allow_enhancement'], + 'curved_line': self.parameter['curved_line'], + 'full_layout': self.parameter['full_layout'], + 'allow_scaling': self.parameter['allow_scaling'], + 'headers_off': self.parameter['headers_off'], + 'override_dpi': self.parameter['dpi'] if self.parameter['dpi'] > 0 else None, + 'logger': LOG, + 'image_filename': f.name} + pcgts_out = Eynollah(**eynollah_kwargs).run() + pcgts_out.get_Page().imageFilename = pcgts_in.get_Page().imageFilename + 
self.workspace.add_file( + ID=file_id, + file_grp=self.output_file_grp, + pageId=page_id, + mimetype=MIMETYPE_PAGE, + local_filename=join(self.output_file_grp, file_id) + '.xml', + content=to_xml(pcgts_out)) diff --git a/setup.py b/setup.py index 7988aee..c050ead 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ setup( entry_points={ 'console_scripts': [ 'eynollah=qurator.eynollah.cli:main', - # 'ocrd-eynollah=qurator.eynollah.ocrd_cli:cli', + 'ocrd-eynollah-segment=qurator.eynollah.ocrd_cli:main', ] }, )