mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-06-09 12:19:54 +02:00
OCR-D CLI
This commit is contained in:
parent
1715f0d8b3
commit
9db6edf51e
6 changed files with 130 additions and 4 deletions
|
@ -80,9 +80,9 @@ class Eynollah:
|
|||
def __init__(
|
||||
self,
|
||||
image_filename,
|
||||
image_filename_stem,
|
||||
dir_out,
|
||||
dir_models,
|
||||
image_filename_stem=None,
|
||||
dir_out=None,
|
||||
dir_of_cropped_images=None,
|
||||
dir_of_layout=None,
|
||||
dir_of_deskewed=None,
|
||||
|
@ -94,6 +94,7 @@ class Eynollah:
|
|||
allow_scaling=False,
|
||||
headers_off=False,
|
||||
override_dpi=None,
|
||||
logger=None,
|
||||
):
|
||||
self.image_filename = image_filename
|
||||
self.dir_out = dir_out
|
||||
|
@ -117,7 +118,7 @@ class Eynollah:
|
|||
dir_out=self.dir_out,
|
||||
image_filename=self.image_filename,
|
||||
curved_line=self.curved_line)
|
||||
self.logger = getLogger('eynollah')
|
||||
self.logger = logger if logger else getLogger('eynollah')
|
||||
self.dir_models = dir_models
|
||||
|
||||
self.model_dir_of_enhancement = dir_models + "/model_enhancement.h5"
|
||||
|
|
54
qurator/eynollah/ocrd-tool.json
Normal file
54
qurator/eynollah/ocrd-tool.json
Normal file
|
@ -0,0 +1,54 @@
|
|||
{
|
||||
"version": "0.0.1",
|
||||
"git_url": "https://github.com/qurator-spk/eynollah",
|
||||
"tools": {
|
||||
"ocrd-eynollah-segment": {
|
||||
"executable": "ocrd-eynollah-segment",
|
||||
"categories": ["Layout analysis"],
|
||||
"description": "Segment page into regions and lines and do reading order detection with eynollah",
|
||||
"input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"],
|
||||
"output_file_grp": ["OCR-D-SEG-LINE"],
|
||||
"steps": ["layout/segmentation/region", "layout/segmentation/line"],
|
||||
"parameters": {
|
||||
"models": {
|
||||
"type": "string",
|
||||
"format": "file",
|
||||
"cacheable": true,
|
||||
"description": "Path to directory containing models to be used (See https://qurator-data.de/eynollah)",
|
||||
"required": true
|
||||
},
|
||||
"dpi": {
|
||||
"type": "number",
|
||||
"format": "float",
|
||||
"description": "pixel density in dots per inch (overrides any meta-data in the images); disabled when <= 0",
|
||||
"default": -1
|
||||
},
|
||||
"full_layout": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Try to detect all elements, including drop-caps and marginalia"
|
||||
},
|
||||
"curved_line": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "If set to true, the tool will try to return the contours of text lines instead of rectangular bounding boxes. Note that processing takes more time with this option."
|
||||
},
|
||||
"allow_enhancement": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "If set to true, the tool checks whether the input image needs resizing and enhancement. If so, the resized and enhanced image and the corresponding layout data are written to the output directory."
|
||||
},
|
||||
"allow_scaling": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "If set to true, the tool checks the image scale and, if needed, rescales the image to improve layout detection."
|
||||
},
|
||||
"headers_off": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "If set to true, the tool ignores the role of headers in the reading order."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
11
qurator/eynollah/ocrd_cli.py
Normal file
11
qurator/eynollah/ocrd_cli.py
Normal file
|
@ -0,0 +1,11 @@
|
|||
from .processor import EynollahProcessor
|
||||
from click import command
|
||||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
||||
|
||||
@command()
@ocrd_cli_options
def main(*args, **kwargs):
    # Command-line entry point: every argument collected by the
    # ocrd_cli_options decorator is forwarded unchanged, together with the
    # processor class, to ocrd_cli_wrap_processor, whose result is returned.
    wrapped_result = ocrd_cli_wrap_processor(EynollahProcessor, *args, **kwargs)
    return wrapped_result
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
59
qurator/eynollah/processor.py
Normal file
59
qurator/eynollah/processor.py
Normal file
|
@ -0,0 +1,59 @@
|
|||
from json import loads
|
||||
from pkg_resources import resource_string
|
||||
from tempfile import NamedTemporaryFile
|
||||
from os.path import join
|
||||
|
||||
from ocrd import Processor
|
||||
from ocrd_modelfactory import page_from_file
|
||||
from ocrd_models.ocrd_page import to_xml
|
||||
from ocrd_utils import (
|
||||
getLogger,
|
||||
MIMETYPE_PAGE,
|
||||
assert_file_grp_cardinality,
|
||||
make_file_id
|
||||
)
|
||||
|
||||
from .eynollah import Eynollah
|
||||
|
||||
OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
|
||||
|
||||
class EynollahProcessor(Processor):
    """OCR-D processor that runs Eynollah on every page of the input file group.

    For each input file the PAGE document is loaded, the page image is
    written to a temporary TIFF file, Eynollah is run on that file, and the
    resulting PAGE document is added to the output file group.
    """

    def __init__(self, *args, **kwargs):
        """Inject this tool's ocrd-tool.json entry and version, then defer to Processor."""
        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment']
        kwargs['version'] = OCRD_TOOL['version']
        super().__init__(*args, **kwargs)

    def process(self):
        """Run Eynollah on each input file and add one PAGE-XML result per file.

        Exactly one input and one output file group are required (checked
        with assert_file_grp_cardinality).
        """
        LOG = getLogger('eynollah')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        # Read the input file list once instead of on every iteration.
        input_files = self.input_files
        total = len(input_files)

        # A 'dpi' parameter <= 0 means "do not override" (see ocrd-tool.json).
        dpi = self.parameter['dpi']
        override_dpi = dpi if dpi > 0 else None

        for n, input_file in enumerate(input_files, start=1):
            page_id = input_file.pageId or input_file.ID
            # Log the page id together with the actual position in the run.
            LOG.info("INPUT FILE %s (%d/%d)", page_id, n, total)
            pcgts_in = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts_in)
            page = pcgts_in.get_Page()
            page_image, _, _ = self.workspace.image_from_page(
                page, page_id, feature_filter='binarized')
            file_id = make_file_id(input_file, self.output_file_grp)
            # Eynollah is given an image file name, so the page image is
            # saved to a temporary TIFF for the duration of the run.
            with NamedTemporaryFile(buffering=0, suffix='.tif') as f:
                page_image.save(f.name)
                eynollah_kwargs = {
                    'dir_models': self.resolve_resource(self.parameter['models']),
                    'allow_enhancement': self.parameter['allow_enhancement'],
                    'curved_line': self.parameter['curved_line'],
                    'full_layout': self.parameter['full_layout'],
                    'allow_scaling': self.parameter['allow_scaling'],
                    'headers_off': self.parameter['headers_off'],
                    'override_dpi': override_dpi,
                    'logger': LOG,
                    'image_filename': f.name,
                }
                pcgts_out = Eynollah(**eynollah_kwargs).run()
                # Point the result at the original image reference rather
                # than the temporary file Eynollah was run on.
                pcgts_out.get_Page().imageFilename = pcgts_in.get_Page().imageFilename
                self.workspace.add_file(
                    ID=file_id,
                    file_grp=self.output_file_grp,
                    pageId=page_id,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=join(self.output_file_grp, file_id) + '.xml',
                    content=to_xml(pcgts_out))
|
Loading…
Add table
Add a link
Reference in a new issue