mirror of
				https://github.com/qurator-spk/eynollah.git
				synced 2025-11-03 19:24:13 +01:00 
			
		
		
		
	OCR-D CLI
This commit is contained in:
		
							parent
							
								
									1715f0d8b3
								
							
						
					
					
						commit
						9db6edf51e
					
				
					 6 changed files with 130 additions and 4 deletions
				
			
		
							
								
								
									
										1
									
								
								ocrd-tool.json
									
										
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								ocrd-tool.json
									
										
									
									
									
										Symbolic link
									
								
							| 
						 | 
					@ -0,0 +1 @@
 | 
				
			||||||
 | 
					qurator/eynollah/ocrd-tool.json
 | 
				
			||||||
| 
						 | 
					@ -80,9 +80,9 @@ class Eynollah:
 | 
				
			||||||
    def __init__(
 | 
					    def __init__(
 | 
				
			||||||
        self,
 | 
					        self,
 | 
				
			||||||
        image_filename,
 | 
					        image_filename,
 | 
				
			||||||
        image_filename_stem,
 | 
					 | 
				
			||||||
        dir_out,
 | 
					 | 
				
			||||||
        dir_models,
 | 
					        dir_models,
 | 
				
			||||||
 | 
					        image_filename_stem=None,
 | 
				
			||||||
 | 
					        dir_out=None,
 | 
				
			||||||
        dir_of_cropped_images=None,
 | 
					        dir_of_cropped_images=None,
 | 
				
			||||||
        dir_of_layout=None,
 | 
					        dir_of_layout=None,
 | 
				
			||||||
        dir_of_deskewed=None,
 | 
					        dir_of_deskewed=None,
 | 
				
			||||||
| 
						 | 
					@ -94,6 +94,7 @@ class Eynollah:
 | 
				
			||||||
        allow_scaling=False,
 | 
					        allow_scaling=False,
 | 
				
			||||||
        headers_off=False,
 | 
					        headers_off=False,
 | 
				
			||||||
        override_dpi=None,
 | 
					        override_dpi=None,
 | 
				
			||||||
 | 
					        logger=None,
 | 
				
			||||||
    ):
 | 
					    ):
 | 
				
			||||||
        self.image_filename = image_filename
 | 
					        self.image_filename = image_filename
 | 
				
			||||||
        self.dir_out = dir_out
 | 
					        self.dir_out = dir_out
 | 
				
			||||||
| 
						 | 
					@ -117,7 +118,7 @@ class Eynollah:
 | 
				
			||||||
            dir_out=self.dir_out,
 | 
					            dir_out=self.dir_out,
 | 
				
			||||||
            image_filename=self.image_filename,
 | 
					            image_filename=self.image_filename,
 | 
				
			||||||
            curved_line=self.curved_line)
 | 
					            curved_line=self.curved_line)
 | 
				
			||||||
        self.logger = getLogger('eynollah')
 | 
					        self.logger = logger if logger else getLogger('eynollah')
 | 
				
			||||||
        self.dir_models = dir_models
 | 
					        self.dir_models = dir_models
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.model_dir_of_enhancement = dir_models + "/model_enhancement.h5"
 | 
					        self.model_dir_of_enhancement = dir_models + "/model_enhancement.h5"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										54
									
								
								qurator/eynollah/ocrd-tool.json
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								qurator/eynollah/ocrd-tool.json
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,54 @@
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  "version": "0.0.1",
 | 
				
			||||||
 | 
					  "git_url": "https://github.com/qurator-spk/eynollah",
 | 
				
			||||||
 | 
					  "tools": {
 | 
				
			||||||
 | 
					    "ocrd-eynollah-segment": {
 | 
				
			||||||
 | 
					      "executable": "ocrd-eynollah-segment",
 | 
				
			||||||
 | 
					      "categories": ["Layout analysis"],
 | 
				
			||||||
 | 
					      "description": "Segment page into regions and lines and do reading order detection with eynollah",
 | 
				
			||||||
 | 
					      "input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"],
 | 
				
			||||||
 | 
					      "output_file_grp": ["OCR-D-SEG-LINE"],
 | 
				
			||||||
 | 
					      "steps": ["layout/segmentation/region", "layout/segmentation/line"],
 | 
				
			||||||
 | 
					      "parameters": {
 | 
				
			||||||
 | 
					        "models": {
 | 
				
			||||||
 | 
					          "type": "string",
 | 
				
			||||||
 | 
					          "format": "file",
 | 
				
			||||||
 | 
					          "cacheable": true,
 | 
				
			||||||
 | 
					          "description": "Path to directory containing models to be used (See https://qurator-data.de/eynollah)",
 | 
				
			||||||
 | 
					          "required": true
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        "dpi": {
 | 
				
			||||||
 | 
					          "type": "number",
 | 
				
			||||||
 | 
					          "format": "float",
 | 
				
			||||||
 | 
					          "description": "pixel density in dots per inch (overrides any meta-data in the images); disabled when <= 0",
 | 
				
			||||||
 | 
					          "default": -1
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        "full_layout": {
 | 
				
			||||||
 | 
					          "type": "boolean",
 | 
				
			||||||
 | 
					          "default": true,
 | 
				
			||||||
 | 
					          "description": "Try to detect all elements, including drop-caps and marginalia"
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        "curved_line": {
 | 
				
			||||||
 | 
					          "type": "boolean",
 | 
				
			||||||
 | 
					          "default": false,
 | 
				
			||||||
 | 
					          "description": "If this parameter is set to true, this tool will try to return the contours of textlines instead of rectangular bounding boxes. Note that with this option the tool needs more time to process."
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        "allow_enhancement": {
 | 
				
			||||||
 | 
					          "type": "boolean",
 | 
				
			||||||
 | 
					          "default": true,
 | 
				
			||||||
 | 
					          "description": "If this parameter is set to true, this tool will check whether the input image needs resizing and enhancement. If so, the resized and enhanced image and the corresponding layout data will be written to the output directory"
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        "allow_scaling": {
 | 
				
			||||||
 | 
					          "type": "boolean",
 | 
				
			||||||
 | 
					          "default": false,
 | 
				
			||||||
 | 
					          "description": "If this parameter is set to true, this tool will check the image scale and, if needed, rescale the image to improve layout detection"
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        "headers_off": {
 | 
				
			||||||
 | 
					          "type": "boolean",
 | 
				
			||||||
 | 
					          "default": false,
 | 
				
			||||||
 | 
					          "description": "If this parameter is set to true, this tool will ignore the role of headers in the reading order"
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										11
									
								
								qurator/eynollah/ocrd_cli.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								qurator/eynollah/ocrd_cli.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,11 @@
 | 
				
			||||||
 | 
					from .processor import EynollahProcessor
 | 
				
			||||||
 | 
					from click import command
 | 
				
			||||||
 | 
					from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@command()
 | 
				
			||||||
 | 
					@ocrd_cli_options
 | 
				
			||||||
 | 
					def main(*args, **kwargs):
 | 
				
			||||||
 | 
					    return ocrd_cli_wrap_processor(EynollahProcessor, *args, **kwargs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == '__main__':
 | 
				
			||||||
 | 
					    main()
 | 
				
			||||||
							
								
								
									
										59
									
								
								qurator/eynollah/processor.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										59
									
								
								qurator/eynollah/processor.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,59 @@
 | 
				
			||||||
 | 
					from json import loads
 | 
				
			||||||
 | 
					from pkg_resources import resource_string
 | 
				
			||||||
 | 
					from tempfile import NamedTemporaryFile
 | 
				
			||||||
 | 
					from os.path import join
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from ocrd import Processor
 | 
				
			||||||
 | 
					from ocrd_modelfactory import page_from_file
 | 
				
			||||||
 | 
					from ocrd_models.ocrd_page import to_xml
 | 
				
			||||||
 | 
					from ocrd_utils import (
 | 
				
			||||||
 | 
					    getLogger,
 | 
				
			||||||
 | 
					    MIMETYPE_PAGE,
 | 
				
			||||||
 | 
					    assert_file_grp_cardinality,
 | 
				
			||||||
 | 
					    make_file_id
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from .eynollah import Eynollah
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class EynollahProcessor(Processor):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self, *args, **kwargs):
 | 
				
			||||||
 | 
					        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment']
 | 
				
			||||||
 | 
					        kwargs['version'] = OCRD_TOOL['version']
 | 
				
			||||||
 | 
					        super().__init__(*args, **kwargs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def process(self):
 | 
				
			||||||
 | 
					        LOG = getLogger('eynollah')
 | 
				
			||||||
 | 
					        assert_file_grp_cardinality(self.input_file_grp, 1)
 | 
				
			||||||
 | 
					        assert_file_grp_cardinality(self.output_file_grp, 1)
 | 
				
			||||||
 | 
					        for n, input_file in enumerate(self.input_files):
 | 
				
			||||||
 | 
					            page_id = input_file.pageId or input_file.ID
 | 
				
			||||||
 | 
					            LOG.info("INPUT FILE %s / %s ", page_id, len(self.input_files))
 | 
				
			||||||
 | 
					            pcgts_in = page_from_file(self.workspace.download_file(input_file))
 | 
				
			||||||
 | 
					            self.add_metadata(pcgts_in)
 | 
				
			||||||
 | 
					            page = pcgts_in.get_Page()
 | 
				
			||||||
 | 
					            page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
 | 
				
			||||||
 | 
					            file_id = make_file_id(input_file, self.output_file_grp)
 | 
				
			||||||
 | 
					            with NamedTemporaryFile(buffering=0, suffix='.tif') as f:
 | 
				
			||||||
 | 
					                page_image.save(f.name)
 | 
				
			||||||
 | 
					                eynollah_kwargs = {
 | 
				
			||||||
 | 
					                    'dir_models': self.resolve_resource(self.parameter['models']),
 | 
				
			||||||
 | 
					                    'allow_enhancement': self.parameter['allow_enhancement'],
 | 
				
			||||||
 | 
					                    'curved_line': self.parameter['curved_line'],
 | 
				
			||||||
 | 
					                    'full_layout': self.parameter['full_layout'],
 | 
				
			||||||
 | 
					                    'allow_scaling': self.parameter['allow_scaling'],
 | 
				
			||||||
 | 
					                    'headers_off': self.parameter['headers_off'],
 | 
				
			||||||
 | 
					                    'override_dpi': self.parameter['dpi'] if self.parameter['dpi'] > 0 else None,
 | 
				
			||||||
 | 
					                    'logger': LOG,
 | 
				
			||||||
 | 
					                    'image_filename': f.name}
 | 
				
			||||||
 | 
					                pcgts_out = Eynollah(**eynollah_kwargs).run()
 | 
				
			||||||
 | 
					                pcgts_out.get_Page().imageFilename = pcgts_in.get_Page().imageFilename
 | 
				
			||||||
 | 
					                self.workspace.add_file(
 | 
				
			||||||
 | 
					                    ID=file_id,
 | 
				
			||||||
 | 
					                    file_grp=self.output_file_grp,
 | 
				
			||||||
 | 
					                    pageId=page_id,
 | 
				
			||||||
 | 
					                    mimetype=MIMETYPE_PAGE,
 | 
				
			||||||
 | 
					                    local_filename=join(self.output_file_grp, file_id) + '.xml',
 | 
				
			||||||
 | 
					                    content=to_xml(pcgts_out))
 | 
				
			||||||
							
								
								
									
										2
									
								
								setup.py
									
										
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
										
									
									
									
								
							| 
						 | 
					@ -16,7 +16,7 @@ setup(
 | 
				
			||||||
    entry_points={
 | 
					    entry_points={
 | 
				
			||||||
        'console_scripts': [
 | 
					        'console_scripts': [
 | 
				
			||||||
            'eynollah=qurator.eynollah.cli:main',
 | 
					            'eynollah=qurator.eynollah.cli:main',
 | 
				
			||||||
            # 'ocrd-eynollah=qurator.eynollah.ocrd_cli:cli',
 | 
					            'ocrd-eynollah-segment=qurator.eynollah.ocrd_cli:main',
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
    },
 | 
					    },
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue