OCR-D CLI

2026-01-07 02:46:59 +01:00 · 2021-04-13 17:38:02 +02:00 · 2021-04-13 17:38:02 +02:00 · 9db6edf51e
commit 9db6edf51e
parent 1715f0d8b3
6 changed files with 130 additions and 4 deletions
--- a/ocrd-tool.json
+++ b/ocrd-tool.json
@ -0,0 +1 @@
+qurator/eynollah/ocrd-tool.json
--- a/qurator/eynollah/eynollah.py
+++ b/qurator/eynollah/eynollah.py
@ -80,9 +80,9 @@ class Eynollah:
    def __init__(
        self,
        image_filename,
-        image_filename_stem,
-        dir_out,
        dir_models,
+        image_filename_stem=None,
+        dir_out=None,
        dir_of_cropped_images=None,
        dir_of_layout=None,
        dir_of_deskewed=None,
@ -94,6 +94,7 @@ class Eynollah:
        allow_scaling=False,
        headers_off=False,
        override_dpi=None,
+        logger=None,
    ):
        self.image_filename = image_filename
        self.dir_out = dir_out
@ -117,7 +118,7 @@ class Eynollah:
            dir_out=self.dir_out,
            image_filename=self.image_filename,
            curved_line=self.curved_line)
-        self.logger = getLogger('eynollah')
+        self.logger = logger if logger else getLogger('eynollah')
        self.dir_models = dir_models

        self.model_dir_of_enhancement = dir_models + "/model_enhancement.h5"
--- a/qurator/eynollah/ocrd-tool.json
+++ b/qurator/eynollah/ocrd-tool.json
@ -0,0 +1,54 @@
+{
+  "version": "0.0.1",
+  "git_url": "https://github.com/qurator-spk/eynollah",
+  "tools": {
+    "ocrd-eynollah-segment": {
+      "executable": "ocrd-eynollah-segment",
+      "categories": ["Layout analysis"],
+      "description": "Segment page into regions and lines and do reading order detection with eynollah",
+      "input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"],
+      "output_file_grp": ["OCR-D-SEG-LINE"],
+      "steps": ["layout/segmentation/region", "layout/segmentation/line"],
+      "parameters": {
+        "models": {
+          "type": "string",
+          "format": "file",
+          "cacheable": true,
+          "description": "Path to directory containing models to be used (See https://qurator-data.de/eynollah)",
+          "required": true
+        },
+        "dpi": {
+          "type": "number",
+          "format": "float",
+          "description": "pixel density in dots per inch (overrides any meta-data in the images); disabled when <= 0",
+          "default": -1
+        },
+        "full_layout": {
+          "type": "boolean",
+          "default": true,
+          "description": "Try to detect all elements, including drop-caps and marginalia"
+        },
+        "curved_line": {
+          "type": "boolean",
+          "default": false,
+          "description": "If this parameter is set to true, this tool will try to return the contours of textlines instead of the rectangular bounding box of each textline. Note that with this option the tool needs more time for processing."
+        },
+        "allow_enhancement": {
+          "type": "boolean",
+          "default": true,
+          "description": "If this parameter is set to true, this tool will check whether the input image needs resizing and enhancement. If so, the resized and enhanced image and the corresponding layout data will be written to the output directory"
+        },
+        "allow_scaling": {
+          "type": "boolean",
+          "default": false,
+          "description": "If this parameter is set to true, this tool will check the scale of the image and, if needed, rescale it to perform better layout detection"
+        },
+        "headers_off": {
+          "type": "boolean",
+          "default": false,
+          "description": "If this parameter is set to true, this tool will ignore the role of headers in the reading order"
+        }
+      }
+    }
+  }
+}
--- a/qurator/eynollah/ocrd_cli.py
+++ b/qurator/eynollah/ocrd_cli.py
@ -0,0 +1,11 @@
+from .processor import EynollahProcessor
+from click import command
+from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
+
+@command()
+@ocrd_cli_options
+def main(*args, **kwargs):  # console-script entry point; forwards all CLI arguments to EynollahProcessor
+    return ocrd_cli_wrap_processor(EynollahProcessor, *args, **kwargs)
+
+if __name__ == '__main__':  # executed only when this module is run as __main__
+    main()
--- a/qurator/eynollah/processor.py
+++ b/qurator/eynollah/processor.py
@ -0,0 +1,59 @@
+from json import loads
+from pkg_resources import resource_string
+from tempfile import NamedTemporaryFile
+from os.path import join
+
+from ocrd import Processor
+from ocrd_modelfactory import page_from_file
+from ocrd_models.ocrd_page import to_xml
+from ocrd_utils import (
+    getLogger,
+    MIMETYPE_PAGE,
+    assert_file_grp_cardinality,
+    make_file_id
+)
+
+from .eynollah import Eynollah
+
+OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
+
+class EynollahProcessor(Processor):
+
+    def __init__(self, *args, **kwargs):
+        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment']
+        kwargs['version'] = OCRD_TOOL['version']
+        super().__init__(*args, **kwargs)
+
+    def process(self):
+        LOG = getLogger('eynollah')
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
+        for n, input_file in enumerate(self.input_files):
+            page_id = input_file.pageId or input_file.ID
+            LOG.info("INPUT FILE %s / %s ", page_id, len(self.input_files))
+            pcgts_in = page_from_file(self.workspace.download_file(input_file))
+            self.add_metadata(pcgts_in)
+            page = pcgts_in.get_Page()
+            page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
+            file_id = make_file_id(input_file, self.output_file_grp)
+            with NamedTemporaryFile(buffering=0, suffix='.tif') as f:
+                page_image.save(f.name)
+                eynollah_kwargs = {
+                    'dir_models': self.resolve_resource(self.parameter['models']),
+                    'allow_enhancement': self.parameter['allow_enhancement'],
+                    'curved_line': self.parameter['curved_line'],
+                    'full_layout': self.parameter['full_layout'],
+                    'allow_scaling': self.parameter['allow_scaling'],
+                    'headers_off': self.parameter['headers_off'],
+                    'override_dpi': self.parameter['dpi'] if self.parameter['dpi'] > 0 else None,
+                    'logger': LOG,
+                    'image_filename': f.name}
+                pcgts_out = Eynollah(**eynollah_kwargs).run()
+                pcgts_out.get_Page().imageFilename = pcgts_in.get_Page().imageFilename
+                self.workspace.add_file(
+                    ID=file_id,
+                    file_grp=self.output_file_grp,
+                    pageId=page_id,
+                    mimetype=MIMETYPE_PAGE,
+                    local_filename=join(self.output_file_grp, file_id) + '.xml',
+                    content=to_xml(pcgts_out))
--- a/setup.py
+++ b/setup.py
@ -16,7 +16,7 @@ setup(
    entry_points={
        'console_scripts': [
            'eynollah=qurator.eynollah.cli:main',
-            # 'ocrd-eynollah=qurator.eynollah.ocrd_cli:cli',
+            'ocrd-eynollah-segment=qurator.eynollah.ocrd_cli:main',
        ]
    },
 )