# sbb_binarization/sbb_binarize/ocrd_cli.py

# TODO: AlternativeImage 'binarized' comment should be additive

import os.path
from pkg_resources import resource_string
from json import loads

from click import command
from ocrd_utils import (
    getLogger,
    assert_file_grp_cardinality,
    make_file_id,
    MIMETYPE_PAGE
)
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import AlternativeImageType, to_xml
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor

from .sbb_binarize import SbbBinarizer

OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
TOOL = 'ocrd-sbb-binarize'

class SbbBinarizeProcessor(Processor):
    """OCR-D processor that binarizes images with ``SbbBinarizer``.

    Depending on the ``operation_level`` parameter, the binarization is applied
    to the whole page image, to every text/table region, or to every text line
    of those regions. Each binarized image is saved into the output file group
    and referenced from the corresponding PAGE element as an
    ``AlternativeImage``.
    """

    def __init__(self, *args, **kwargs):
        """Set up the processor with the tool description from ``ocrd-tool.json``.

        All positional and keyword arguments are forwarded to ``Processor``;
        ``ocrd_tool`` and ``version`` are always overwritten with the values
        read from the bundled ``ocrd-tool.json``.
        """
        kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
        kwargs['version'] = OCRD_TOOL['version']
        super().__init__(*args, **kwargs)

    def _binarize_segment(self, segment, image, image_file_id, page_id,
                          model_path, use_patches):
        """Binarize ``image``, save it and attach it to ``segment``.

        Args:
            segment: PAGE element (page, region or line) that receives the
                new ``AlternativeImage`` entry.
            image: image to binarize (passed to ``SbbBinarizer`` as ``image``).
            image_file_id: file ID under which the binarized image is saved.
            page_id: page ID the saved image file is associated with.
            model_path: value of the ``model`` parameter.
            use_patches: value of the ``patches`` parameter.
        """
        bin_image = SbbBinarizer(
            image=image,
            model=model_path,
            patches=use_patches,
            save=None
        ).run()
        # update METS (add the image file):
        bin_image_path = self.workspace.save_image_file(
            bin_image,
            image_file_id,
            page_id=page_id,
            file_grp=self.output_file_grp)
        # The keyword is ``comments`` on every level; a ``comment`` keyword
        # does not correspond to the PAGE ``@comments`` attribute.
        segment.add_AlternativeImage(
            AlternativeImageType(filename=bin_image_path, comments='binarized'))

    def process(self):
        """
        Binarize with sbb_binarization

        For every input file: parse the PAGE document, binarize on the
        configured ``operation_level`` (``page``, ``region`` or ``line``),
        and add the updated PAGE-XML to the output file group.
        """
        LOG = getLogger('processor.SbbBinarize')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        oplevel = self.parameter['operation_level']
        use_patches = self.parameter['patches']
        model_path = self.parameter['model']

        for n, input_file in enumerate(self.input_files):
            file_id = make_file_id(input_file, self.output_file_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            pcgts.set_pcGtsId(file_id)
            page = pcgts.get_Page()

            # The page image and its coordinates are needed on every level:
            # region and line images are derived from them, so they must be
            # loaded before branching on the operation level.
            page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id)

            if oplevel == 'page':
                LOG.info("Binarizing on 'page' level in page '%s'", page_id)
                self._binarize_segment(
                    page, page_image, file_id + '.IMG-BIN', page_id,
                    model_path, use_patches)

            else:
                regions = page.get_AllRegions(['Text', 'Table'])
                if not regions:
                    LOG.warning("Page '%s' contains no text/table regions", page_id)

                for region in regions:
                    if oplevel == 'region':
                        # The region crop is only needed on this level.
                        region_image, _ = self.workspace.image_from_segment(
                            region, page_image, page_xywh)
                        self._binarize_segment(
                            region, region_image,
                            "%s_%s.IMG-BIN" % (file_id, region.id), page_id,
                            model_path, use_patches)

                    elif oplevel == 'line':
                        lines = region.get_TextLine()
                        if not lines:
                            LOG.warning("Page '%s' region '%s' contains no text lines", page_id, region.id)
                        for line in lines:
                            line_image, _ = self.workspace.image_from_segment(
                                line, page_image, page_xywh)
                            self._binarize_segment(
                                line, line_image,
                                "%s_%s_%s.IMG-BIN" % (file_id, region.id, line.id), page_id,
                                model_path, use_patches)

            # Register the updated PAGE-XML in the output file group.
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))

@command()
@ocrd_cli_options
def cli(*args, **kwargs):
    # Command-line entry point: forwards every positional and keyword argument
    # it receives, unchanged, to the OCR-D wrapper together with the processor
    # class, and returns whatever the wrapper returns.
    processor_class = SbbBinarizeProcessor
    outcome = ocrd_cli_wrap_processor(processor_class, *args, **kwargs)
    return outcome
implement region/line binarization 4 years ago			`# TODO: AlternativeImage 'binarized' comment should be additive`

initial OCR-D interface 4 years ago			`import os.path`
			`from pkg_resources import resource_string`
			`from json import loads`

add OCR-D click interface 4 years ago			`from click import command`
initial OCR-D interface 4 years ago			`from ocrd_utils import (`
			`getLogger,`
			`assert_file_grp_cardinality,`
			`make_file_id,`
			`MIMETYPE_PAGE`
			`)`
add OCR-D click interface 4 years ago			`from ocrd import Processor`
initial OCR-D interface 4 years ago			`from ocrd_modelfactory import page_from_file`
implement region/line binarization 4 years ago			`from ocrd_models.ocrd_page import AlternativeImageType, to_xml`
add OCR-D click interface 4 years ago			`from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor`
initial OCR-D interface 4 years ago
			`from .sbb_binarize import SbbBinarizer`

			`OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))`
			`TOOL = 'ocrd-sbb-binarize'`

			`class SbbBinarizeProcessor(Processor):`

			`def __init__(self, *args, **kwargs):`
			`kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]`
			`kwargs['version'] = OCRD_TOOL['version']`
			`super().__init__(*args, **kwargs)`

			`def process(self):`
			`"""`
			`Binarize with sbb_binarization`
			`"""`
			`LOG = getLogger('processor.SbbBinarize')`
			`assert_file_grp_cardinality(self.input_file_grp, 1)`
			`assert_file_grp_cardinality(self.output_file_grp, 1)`

			`oplevel = self.parameter['operation_level']`
			`use_patches = self.parameter['patches']`
			`model_path = self.parameter['model']`

			`for n, input_file in enumerate(self.input_files):`
			`file_id = make_file_id(input_file, self.output_file_grp)`
			`page_id = input_file.pageId or input_file.ID`
			`LOG.info("INPUT FILE %i / %s", n, page_id)`
			`pcgts = page_from_file(self.workspace.download_file(input_file))`
			`self.add_metadata(pcgts)`
implement region/line binarization 4 years ago			`pcgts.set_pcGtsId(file_id)`
initial OCR-D interface 4 years ago			`page = pcgts.get_Page()`

			`if oplevel == 'page':`
			`LOG.info("Binarizing on 'page' level in page '%s'", page_id)`
			`page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id)`
implement region/line binarization 4 years ago			`bin_image = SbbBinarizer(`
			`image=page_image,`
			`model=model_path,`
			`patches=use_patches,`
			`save=None`
			`).run()`
initial OCR-D interface 4 years ago			`# update METS (add the image file):`
			`bin_image_path = self.workspace.save_image_file(bin_image,`
			`file_id + '.IMG-BIN',`
			`page_id=page_id,`
			`file_grp=self.output_file_grp)`
implement region/line binarization 4 years ago			`page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comment="binarized"))`

initial OCR-D interface 4 years ago			`else:`
implement region/line binarization 4 years ago			`regions = page.get_AllRegions(['Text', 'Table'])`
			`if not regions:`
			`LOG.warning("Page '%s' contains no text/table regions", page_id)`

			`for region in regions:`
			`region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh)`

			`if oplevel == 'region':`
			`region_image_bin = SbbBinarizer(`
			`image=region_image,`
			`model=model_path,`
			`patches=use_patches,`
			`save=None`
			`).run()`
			`region_image_bin_path = self.workspace.save_image_file(`
			`region_image_bin,`
			`"%s_%s.IMG-BIN" % (file_id, region.id),`
			`page_id=page_id,`
			`file_grp=self.output_file_grp)`
			`region.add_AlternativeImage(`
			`AlternativeImageType(filename=region_image_bin_path, comments='binarized'))`

			`elif oplevel == 'line':`
			`lines = region.get_TextLine()`
			`if not lines:`
			`LOG.warning("Page '%s' region '%s' contains no text lines", page_id, region.id)`
			`for line in lines:`
			`line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh)`
			`line_image_bin = SbbBinarizer(`
			`image=line_image,`
			`model=model_path,`
			`patches=use_patches,`
			`save=None`
			`).run()`
			`line_image_bin_path = self.workspace.save_image_file(`
			`line_image_bin,`
			`"%s_%s_%s.IMG-BIN" % (file_id, region.id, line.id),`
			`page_id=page_id,`
			`file_grp=self.output_file_grp)`
			`line.add_AlternativeImage(`
			`AlternativeImageType(filename=line_image_bin_path, comments='binarized'))`
initial OCR-D interface 4 years ago
			`self.workspace.add_file(`
			`ID=file_id,`
			`file_grp=self.output_file_grp,`
			`pageId=input_file.pageId,`
			`mimetype=MIMETYPE_PAGE,`
			`local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),`
			`content=to_xml(pcgts))`
add OCR-D click interface 4 years ago
			`@command()`
			`@ocrd_cli_options`
			`def cli(*args, **kwargs):`
			`return ocrd_cli_wrap_processor(SbbBinarizeProcessor, *args, **kwargs)`