Merge pull request #33 from qurator-spk/ocrd-cli

Ocrd cli
2025-07-18 23:39:54 +02:00 · 2021-04-22 15:22:22 +02:00 · 2021-04-22 15:22:22 +02:00 · d5be8aece3
commit d5be8aece3
parent 1184d3d2fc 6c8852eb04
11 changed files with 211 additions and 59 deletions
--- a/ocrd-tool.json
+++ b/ocrd-tool.json
@ -0,0 +1 @@
+qurator/eynollah/ocrd-tool.json
--- a/qurator/eynollah/cli.py
+++ b/qurator/eynollah/cli.py
@ -117,20 +117,19 @@ def main(
        print("Error: You used -ep to enable plotting but set none of -sl, -sd, -sa or -si")
        sys.exit(1)
    eynollah = Eynollah(
-        image,
-        None,
-        out,
-        model,
-        save_images,
-        save_layout,
-        save_deskewed,
-        save_all,
-        enable_plotting,
-        allow_enhancement,
-        curved_line,
-        full_layout,
-        allow_scaling,
-        headers_off,
+        image_filename=image,
+        dir_out=out,
+        dir_models=model,
+        dir_of_cropped_images=save_images,
+        dir_of_layout=save_layout,
+        dir_of_deskewed=save_deskewed,
+        dir_of_all=save_all,
+        enable_plotting=enable_plotting,
+        allow_enhancement=allow_enhancement,
+        curved_line=curved_line,
+        full_layout=full_layout,
+        allow_scaling=allow_scaling,
+        headers_off=headers_off,
    )
    pcgts = eynollah.run()
    eynollah.writer.write_pagexml(pcgts)
--- a/qurator/eynollah/eynollah.py
+++ b/qurator/eynollah/eynollah.py
@ -65,7 +65,7 @@ from .utils import (
    order_of_regions,
    find_number_of_columns_in_document,
    return_boxes_of_images_by_order_of_reading_new)
-from .utils.pil_cv2 import check_dpi
+from .utils.pil_cv2 import check_dpi, pil2cv
 from .utils.xml import order_and_id_of_texts
 from .plot import EynollahPlotter
 from .writer import EynollahXmlWriter
@ -79,10 +79,11 @@ KERNEL = np.ones((5, 5), np.uint8)
 class Eynollah:
    def __init__(
        self,
-        image_filename,
-        image_filename_stem,
-        dir_out,
        dir_models,
+        image_filename,
+        image_pil=None,
+        image_filename_stem=None,
+        dir_out=None,
        dir_of_cropped_images=None,
        dir_of_layout=None,
        dir_of_deskewed=None,
@ -92,30 +93,36 @@ class Eynollah:
        curved_line=False,
        full_layout=False,
        allow_scaling=False,
-        headers_off=False
+        headers_off=False,
+        override_dpi=None,
+        logger=None,
+        pcgts=None,
    ):
+        if image_pil:
+            self._imgs = self._cache_images(image_pil=image_pil)
+        else:
+            self._imgs = self._cache_images(image_filename=image_filename)
+        if override_dpi:
+            self.dpi = override_dpi
        self.image_filename = image_filename
        self.dir_out = dir_out
-        self.image_filename_stem = image_filename_stem
        self.allow_enhancement = allow_enhancement
        self.curved_line = curved_line
        self.full_layout = full_layout
        self.allow_scaling = allow_scaling
        self.headers_off = headers_off
-        if not self.image_filename_stem:
-            self.image_filename_stem = Path(Path(image_filename).name).stem
        self.plotter = None if not enable_plotting else EynollahPlotter(
            dir_of_all=dir_of_all,
            dir_of_deskewed=dir_of_deskewed,
            dir_of_cropped_images=dir_of_cropped_images,
            dir_of_layout=dir_of_layout,
-            image_filename=image_filename,
-            image_filename_stem=self.image_filename_stem)
+            image_filename_stem=Path(Path(image_filename).name).stem)
        self.writer = EynollahXmlWriter(
            dir_out=self.dir_out,
            image_filename=self.image_filename,
-            curved_line=self.curved_line)
-        self.logger = getLogger('eynollah')
+            curved_line=self.curved_line,
+            pcgts=pcgts)
+        self.logger = logger if logger else getLogger('eynollah')
        self.dir_models = dir_models

        self.model_dir_of_enhancement = dir_models + "/model_enhancement.h5"
@ -128,7 +135,18 @@ class Eynollah:
        self.model_region_dir_p_ens = dir_models + "/model_ensemble_s.h5"
        self.model_textline_dir = dir_models + "/model_textline_newspapers.h5"

-        self._imgs = {}
+    def _cache_images(self, image_filename=None, image_pil=None):
+        """Load the page image once and precompute the variants served by ``imread``.
+
+        The image is read with ``cv2.imread`` when ``image_filename`` is truthy;
+        otherwise ``image_pil`` is converted with ``pil2cv``. As a side effect,
+        ``self.dpi`` is set from ``check_dpi`` called on the same input.
+
+        Returns a dict with the keys ``img``, ``img_grayscale``, ``img_uint8``
+        and ``img_grayscale_uint8``.
+        """
+        ret = {}
+        if image_filename:
+            ret['img'] = cv2.imread(image_filename)
+            self.dpi = check_dpi(image_filename)
+        else:
+            ret['img'] = pil2cv(image_pil)
+            self.dpi = check_dpi(image_pil)
+        # The grayscale variant is derived from the colour array rather than re-read.
+        ret['img_grayscale'] = cv2.cvtColor(ret['img'], cv2.COLOR_BGR2GRAY)
+        # Add a uint8-cast copy of both the colour and the grayscale array.
+        for prefix in ('',  '_grayscale'):
+            ret[f'img{prefix}_uint8'] = ret[f'img{prefix}'].astype(np.uint8)
+        return ret

    def imread(self, grayscale=False, uint8=True):
        key = 'img'
@ -136,16 +154,9 @@ class Eynollah:
            key += '_grayscale'
        if uint8:
            key += '_uint8'
-        if key not in self._imgs:
-            if grayscale:
-                img = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE)
-            else:
-                img = cv2.imread(self.image_filename)
-            if uint8:
-                img = img.astype(np.uint8)
-            self._imgs[key] = img
        return self._imgs[key].copy()

+
    def predict_enhancement(self, img):
        self.logger.debug("enter predict_enhancement")
        model_enhancement, session_enhancement = self.start_new_session_and_model(self.model_dir_of_enhancement)
@ -346,10 +357,7 @@ class Eynollah:

    def resize_and_enhance_image_with_column_classifier(self):
        self.logger.debug("enter resize_and_enhance_image_with_column_classifier")
-        try:
-            dpi = check_dpi(self.image_filename)
-        except:
-            dpi = 230
+        dpi = self.dpi
        self.logger.info("Detected %s DPI", dpi)
        img = self.imread()

@ -1503,7 +1511,6 @@ class Eynollah:
        scale = 1
        if is_image_enhanced:
            if self.allow_enhancement:
-                cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif", img_res)
                img_res = img_res.astype(np.uint8)
                self.get_image_and_scales(img_org, img_res, scale)
            else:
--- a/qurator/eynollah/ocrd-tool.json
+++ b/qurator/eynollah/ocrd-tool.json
@ -0,0 +1,54 @@
+{
+  "version": "0.0.1",
+  "git_url": "https://github.com/qurator-spk/eynollah",
+  "tools": {
+    "ocrd-eynollah-segment": {
+      "executable": "ocrd-eynollah-segment",
+      "categories": ["Layout analysis"],
+      "description": "Segment page into regions and lines and do reading order detection with eynollah",
+      "input_file_grp": ["OCR-D-IMG", "OCR-D-SEG-PAGE", "OCR-D-GT-SEG-PAGE"],
+      "output_file_grp": ["OCR-D-SEG-LINE"],
+      "steps": ["layout/segmentation/region", "layout/segmentation/line"],
+      "parameters": {
+        "models": {
+          "type": "string",
+          "format": "file",
+          "cacheable": true,
+          "description": "Path to directory containing models to be used (See https://qurator-data.de/eynollah)",
+          "required": true
+        },
+        "dpi": {
+          "type": "number",
+          "format": "float",
+          "description": "pixel density in dots per inch (overrides any meta-data in the images); ignored if <= 0 (with fall-back 230)",
+          "default": 0
+        },
+        "full_layout": {
+          "type": "boolean",
+          "default": true,
+          "description": "Try to detect all element subtypes, including drop-caps and headings"
+        },
+        "curved_line": {
+          "type": "boolean",
+          "default": false,
+          "description": "try to return contour of textlines instead of just rectangle bounding box. Needs more processing time"
+        },
+        "allow_enhancement": {
+          "type": "boolean",
+          "default": true,
+          "description": "if the input image has less than 300 DPI, then upscale and enhance"
+        },
+        "allow_scaling": {
+          "type": "boolean",
+          "default": false,
+          "description": "check the resolution against the number of detected columns and if needed, scale the image up or down during layout detection (heuristic to improve quality and performance)"
+        },
+        "headers_off": {
+          "type": "boolean",
+          "default": false,
+          "description": "ignore the special role of headings during reading order detection"
+        }
+      }
+    }
+  }
+}
--- a/qurator/eynollah/ocrd_cli.py
+++ b/qurator/eynollah/ocrd_cli.py
@ -0,0 +1,11 @@
+from .processor import EynollahProcessor
+from click import command
+from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
+
+# Console entry point: forwards whatever the OCR-D CLI option decorator
+# collected to ocrd_cli_wrap_processor together with EynollahProcessor.
+# (Kept as a comment, not a docstring, so the click help text is unchanged.)
+@command()
+@ocrd_cli_options
+def main(*args, **kwargs):
+    return ocrd_cli_wrap_processor(EynollahProcessor, *args, **kwargs)
+
+if __name__ == '__main__':
+    main()
--- a/qurator/eynollah/plot.py
+++ b/qurator/eynollah/plot.py
@ -21,7 +21,6 @@ class EynollahPlotter():
        dir_of_deskewed,
        dir_of_layout,
        dir_of_cropped_images,
-        image_filename,
        image_filename_stem,
        image_org=None,
        scale_x=1,
@ -31,7 +30,6 @@ class EynollahPlotter():
        self.dir_of_layout = dir_of_layout
        self.dir_of_cropped_images = dir_of_cropped_images
        self.dir_of_deskewed = dir_of_deskewed
-        self.image_filename = image_filename
        self.image_filename_stem = image_filename_stem
        # XXX TODO hacky these cannot be set at init time
        self.image_org = image_org
--- a/qurator/eynollah/processor.py
+++ b/qurator/eynollah/processor.py
@ -0,0 +1,65 @@
+from json import loads
+from pkg_resources import resource_string
+from tempfile import NamedTemporaryFile
+from pathlib import Path
+from os.path import join
+
+from PIL import Image
+
+from ocrd import Processor
+from ocrd_modelfactory import page_from_file, exif_from_filename
+from ocrd_models import OcrdFile, OcrdExif
+from ocrd_models.ocrd_page import to_xml
+from ocrd_utils import (
+    getLogger,
+    MIMETYPE_PAGE,
+    assert_file_grp_cardinality,
+    make_file_id
+)
+
+from .eynollah import Eynollah
+from .utils.pil_cv2 import pil2cv
+
+OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
+
+# OCR-D processor that runs Eynollah on every input file of the workspace and
+# registers the resulting PAGE-XML in the output file group.
+class EynollahProcessor(Processor):
+
+    def __init__(self, *args, **kwargs):
+        # Tool description and version are taken from the bundled ocrd-tool.json.
+        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment']
+        kwargs['version'] = OCRD_TOOL['version']
+        super().__init__(*args, **kwargs)
+
+    def process(self):
+        LOG = getLogger('eynollah')
+        # Both file groups are checked with an expected cardinality of 1.
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
+        for n, input_file in enumerate(self.input_files):
+            # Fall back to the file ID when the file has no page ID.
+            page_id = input_file.pageId or input_file.ID
+            LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files))
+            pcgts = page_from_file(self.workspace.download_file(input_file))
+            self.add_metadata(pcgts)
+            page = pcgts.get_Page()
+            # XXX loses DPI information
+            # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
+            # Use the first METS file whose URL equals the page's imageFilename and
+            # hand its downloaded local path to Eynollah.
+            # NOTE(review): next() has no default here, so an empty search result
+            # would presumably raise StopIteration - confirm and handle if needed.
+            image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename
+            eynollah_kwargs = {
+                'dir_models': self.resolve_resource(self.parameter['models']),
+                'allow_enhancement': self.parameter['allow_enhancement'],
+                'curved_line': self.parameter['curved_line'],
+                'full_layout': self.parameter['full_layout'],
+                'allow_scaling': self.parameter['allow_scaling'],
+                'headers_off': self.parameter['headers_off'],
+                'override_dpi': self.parameter['dpi'],
+                'logger': LOG,
+                'pcgts': pcgts,
+                'image_filename': image_filename
+                }
+            # The return value of run() is discarded; the pcgts object passed in above
+            # is what gets serialized below (presumably filled in place by Eynollah's
+            # writer - verify against EynollahXmlWriter).
+            Eynollah(**eynollah_kwargs).run()
+            file_id = make_file_id(input_file, self.output_file_grp)
+            self.workspace.add_file(
+                ID=file_id,
+                file_grp=self.output_file_grp,
+                pageId=page_id,
+                mimetype=MIMETYPE_PAGE,
+                local_filename=join(self.output_file_grp, file_id) + '.xml',
+                content=to_xml(pcgts))
--- a/qurator/eynollah/utils/pil_cv2.py
+++ b/qurator/eynollah/utils/pil_cv2.py
@ -1,12 +1,12 @@
 from PIL import Image
 import numpy as np
 from ocrd_models import OcrdExif
-from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor, imread
+from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, COLOR_BGR2RGB, cvtColor, imread

 # from sbb_binarization

 def cv2pil(img):
-    return Image.fromarray(img.astype('uint8'))
+    # Swap the channel order from BGR to RGB (cvtColor with COLOR_BGR2RGB)
+    # before wrapping the array in a PIL image.
+    return Image.fromarray(np.array(cvtColor(img, COLOR_BGR2RGB)))

 def pil2cv(img):
    # from ocrd/workspace.py
@ -14,11 +14,21 @@ def pil2cv(img):
    pil_as_np_array = np.array(img).astype('uint8') if img.mode == '1' else np.array(img)
    return cvtColor(pil_as_np_array, color_conversion)

-def check_dpi(image_filename):
-    exif = OcrdExif(Image.open(image_filename))
-    print(exif.to_xml())
-    resolution = exif.resolution
-    if exif.resolutionUnit == 'cm':
-        resolution /= 2.54
-    return int(resolution)
-
+def check_dpi(img):
+    """Return the pixel density of ``img`` in dots per inch, or 230 as fall-back.
+
+    ``img`` may be a PIL image, a file name (``str``) or anything ``cv2pil``
+    accepts (an OpenCV array). The resolution is read through ``OcrdExif``.
+    A reported resolution of exactly 1 is treated as "no information".
+
+    Any failure while opening, converting or inspecting the image is caught,
+    the exception is printed, and the fall-back value 230 is returned.
+    """
+    try:
+        # ``Image`` is the PIL *module*; the image class is ``Image.Image``.
+        # (``Image.__class__`` is the module type and never matches an image,
+        # which sent every PIL input down the cv2 branch and on to the fall-back.)
+        if isinstance(img, Image.Image):
+            pil_image = img
+        elif isinstance(img, str):
+            pil_image = Image.open(img)
+        else:
+            pil_image = cv2pil(img)
+        exif = OcrdExif(pil_image)
+        resolution = exif.resolution
+        if resolution == 1:
+            raise Exception()
+        if exif.resolutionUnit == 'cm':
+            # pixels per centimetre -> pixels per inch: 1 inch = 2.54 cm
+            resolution *= 2.54
+        return int(resolution)
+    except Exception as e:
+        print(e)
+        return 230
--- a/qurator/eynollah/writer.py
+++ b/qurator/eynollah/writer.py
@ -28,14 +28,17 @@ class EynollahXmlWriter():
        self.counter = EynollahIdCounter()
        self.dir_out = dir_out
        self.image_filename = image_filename
-        self.image_filename_stem = Path(Path(image_filename).name).stem
        self.curved_line = curved_line
-        self.pcgts = pcgts if pcgts else PcGtsType()
+        self.pcgts = pcgts
        self.scale_x = None # XXX set outside __init__
        self.scale_y = None # XXX set outside __init__
        self.height_org = None # XXX set outside __init__
        self.width_org = None # XXX set outside __init__

+    @property
+    def image_filename_stem(self):
+        """Base name of ``self.image_filename`` without directory and without its final suffix."""
+        return Path(Path(self.image_filename).name).stem
+
    def calculate_page_coords(self, cont_page):
        self.logger.debug('enter calculate_page_coords')
        points_page_print = ""
@ -141,7 +144,7 @@ class EynollahXmlWriter():
        self.logger.debug('enter build_pagexml_no_full_layout')

        # create the file structure
-        pcgts = create_page_xml(self.image_filename, self.height_org, self.width_org)
+        pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org)
        page = pcgts.get_Page()
        page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page))))

@ -181,7 +184,7 @@ class EynollahXmlWriter():
        self.logger.debug('enter build_pagexml_full_layout')

        # create the file structure
-        pcgts = create_page_xml(self.image_filename, self.height_org, self.width_org)
+        pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org)
        page = pcgts.get_Page()
        page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page))))

--- a/setup.py
+++ b/setup.py
@ -13,10 +13,13 @@ setup(
    namespace_packages=['qurator'],
    packages=find_packages(exclude=['tests']),
    install_requires=install_requires,
+    package_data={
+        '': ['*.json']
+    },
    entry_points={
        'console_scripts': [
            'eynollah=qurator.eynollah.cli:main',
-            # 'ocrd-eynollah=qurator.eynollah.ocrd_cli:cli',
+            'ocrd-eynollah-segment=qurator.eynollah.ocrd_cli:main',
        ]
    },
 )
--- a/tests/test_dpi.py
+++ b/tests/test_dpi.py
@ -1,10 +1,11 @@
+import cv2
 from pathlib import Path
 from qurator.eynollah.utils.pil_cv2 import check_dpi
 from tests.base import main

 def test_dpi():
-    fpath = Path(__file__).parent.joinpath('resources', 'kant_aufklaerung_1784_0020.tif')
-    assert 300 == check_dpi(str(fpath))
+    # The image is decoded with cv2 first, so check_dpi receives a pixel array
+    # instead of a file name; the expected value is check_dpi's 230 fall-back
+    # (presumably because the decoded array carries no resolution metadata).
+    fpath = str(Path(__file__).parent.joinpath('resources', 'kant_aufklaerung_1784_0020.tif'))
+    assert 230 == check_dpi(cv2.imread(fpath))

 if __name__ == '__main__':
    main(__file__)