diff --git a/pyproject.toml b/pyproject.toml index 61d488a..66ba733 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ plotting = ["matplotlib"] [project.scripts] eynollah = "eynollah.cli:main" ocrd-eynollah-segment = "eynollah.ocrd_cli:main" -ocrd-sbb-binarize = "eynollah.ocrd_cli_binarization:cli" +ocrd-sbb-binarize = "eynollah.ocrd_cli_binarization:main" [project.urls] Homepage = "https://github.com/qurator-spk/eynollah" diff --git a/src/eynollah/ocrd-tool.json b/src/eynollah/ocrd-tool.json index e7f1585..125131c 100644 --- a/src/eynollah/ocrd-tool.json +++ b/src/eynollah/ocrd-tool.json @@ -91,6 +91,47 @@ "path_in_archive": "models_eynollah" } ] + }, + "ocrd-sbb-binarize": { + "executable": "ocrd-sbb-binarize", + "description": "Pixelwise binarization with selectional auto-encoders in Keras", + "categories": ["Image preprocessing"], + "steps": ["preprocessing/optimization/binarization"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "parameters": { + "operation_level": { + "type": "string", + "enum": ["page", "region"], + "default": "page", + "description": "PAGE XML hierarchy level to operate on" + }, + "model": { + "description": "Directory containing HDF5 or SavedModel/ProtoBuf models. Can be an absolute path or a path relative to the OCR-D resource location, the current working directory or the $SBB_BINARIZE_DATA environment variable (if set)", + "type": "string", + "format": "uri", + "content-type": "text/directory", + "required": true + } + }, + "resources": [ + { + "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip", + "name": "default", + "type": "archive", + "path_in_archive": "saved_model_2020_01_16", + "size": 563147331, + "description": "default models provided by github.com/qurator-spk (SavedModel format)" + }, + { + "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip", + "name": "default-2021-03-09", + "type": "archive", + "path_in_archive": ".", + "size": 133230419, + "description": "updated default models provided by github.com/qurator-spk (SavedModel format)" + } + ] } } } diff --git a/src/eynollah/ocrd_cli_binarization.py b/src/eynollah/ocrd_cli_binarization.py index 6a8bbdc..6bc4b74 100644 --- a/src/eynollah/ocrd_cli_binarization.py +++ b/src/eynollah/ocrd_cli_binarization.py @@ -1,29 +1,16 @@ -from os import environ -from os.path import join -from pathlib import Path -from pkg_resources import resource_string -from json import loads +from typing import Optional from PIL import Image import numpy as np import cv2 from click import command -from ocrd_utils import ( - getLogger, - assert_file_grp_cardinality, - make_file_id, - MIMETYPE_PAGE -) -from ocrd import Processor -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import AlternativeImageType, to_xml +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage +from ocrd_models.ocrd_page import OcrdPage, AlternativeImageType from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from .sbb_binarize import SbbBinarizer -OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool-binarization.json').decode('utf8')) -TOOL = 'ocrd-sbb-binarize' def cv2pil(img): return Image.fromarray(img.astype('uint8')) @@ -35,39 +22,22 @@ def pil2cv(img): return cv2.cvtColor(pil_as_np_array, color_conversion) class SbbBinarizeProcessor(Processor): + # already employs GPU (without singleton process atm) + max_workers = 1 - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() + @property + def executable(self): + return 'ocrd-sbb-binarize' def setup(self): """ Set up the model prior to processing. """ - LOG = getLogger('processor.SbbBinarize.__init__') - if not 'model' in self.parameter: - raise ValueError("'model' parameter is required") - # resolve relative path via environment variable - model_path = Path(self.parameter['model']) - if not model_path.is_absolute(): - if 'SBB_BINARIZE_DATA' in environ and environ['SBB_BINARIZE_DATA']: - LOG.info("Environment variable SBB_BINARIZE_DATA is set to '%s'" \ - " - prepending to model value '%s'. If you don't want this mechanism," \ - " unset the SBB_BINARIZE_DATA environment variable.", - environ['SBB_BINARIZE_DATA'], model_path) - model_path = Path(environ['SBB_BINARIZE_DATA']).joinpath(model_path) - model_path = model_path.resolve() - if not model_path.is_dir(): - raise FileNotFoundError("Does not exist or is not a directory: %s" % model_path) # resolve relative path via OCR-D ResourceManager - model_path = self.resolve_resource(str(model_path)) - self.binarizer = SbbBinarizer(model_dir=model_path, logger=LOG) + model_path = self.resolve_resource(self.parameter['model']) + self.binarizer = SbbBinarizer(model_dir=model_path, logger=self.logger) - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ Binarize images with sbb_binarization (based on selectional auto-encoders). @@ -88,71 +58,52 @@ class SbbBinarizeProcessor(Processor): Produce a new PAGE output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.SbbBinarize') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - + assert input_pcgts + assert input_pcgts[0] + assert self.parameter oplevel = self.parameter['operation_level'] - - for n, input_file in enumerate(self.input_files): - file_id = make_file_id(input_file, self.output_file_grp) - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - pcgts.set_pcGtsId(file_id) - page = pcgts.get_Page() - page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - - if oplevel == 'page': - LOG.info("Binarizing on 'page' level in page '%s'", page_id) - bin_image = cv2pil(self.binarizer.run(image=pil2cv(page_image), use_patches=True)) - # update METS (add the image file): - bin_image_path = self.workspace.save_image_file(bin_image, - file_id + '.IMG-BIN', - page_id=input_file.pageId, - file_grp=self.output_file_grp) - page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments='%s,binarized' % page_xywh['features'])) - - elif oplevel == 'region': - regions = page.get_AllRegions(['Text', 'Table'], depth=1) - if not regions: - LOG.warning("Page '%s' contains no text/table regions", page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh, feature_filter='binarized') - region_image_bin = cv2pil(binarizer.run(image=pil2cv(region_image), use_patches=True)) - region_image_bin_path = self.workspace.save_image_file( - region_image_bin, - "%s_%s.IMG-BIN" % (file_id, region.id), - page_id=input_file.pageId, - file_grp=self.output_file_grp) - region.add_AlternativeImage( - AlternativeImageType(filename=region_image_bin_path, comments='%s,binarized' % region_xywh['features'])) - - elif oplevel == 'line': - region_line_tuples = [(r.id, r.get_TextLine()) for r in page.get_AllRegions(['Text'], depth=0)] - if not region_line_tuples: - LOG.warning("Page '%s' contains no text lines", page_id) - for region_id, line in region_line_tuples: - line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized') - line_image_bin = cv2pil(binarizer.run(image=pil2cv(line_image), use_patches=True)) - line_image_bin_path = self.workspace.save_image_file( - line_image_bin, - "%s_%s_%s.IMG-BIN" % (file_id, region_id, line.id), - page_id=input_file.pageId, - file_grp=self.output_file_grp) - line.add_AlternativeImage( - AlternativeImageType(filename=line_image_bin_path, comments='%s,binarized' % line_xywh['features'])) - - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=join(self.output_file_grp, file_id + '.xml'), - content=to_xml(pcgts)) + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + page_image, page_xywh, _ = self.workspace.image_from_page( + page, page_id, feature_filter='binarized') + + if oplevel == 'page': + self.logger.info("Binarizing on 'page' level in page '%s'", page_id) + page_image_bin = cv2pil(self.binarizer.run(image=pil2cv(page_image), use_patches=True)) + # update PAGE (reference the image file): + page_image_ref = AlternativeImageType(comments=page_xywh['features'] + ',binarized,clipped') + page.add_AlternativeImage(page_image_ref) + result.images.append(OcrdPageResultImage(page_image_bin, '.IMG-BIN', page_image_ref)) + + elif oplevel == 'region': + regions = page.get_AllRegions(['Text', 'Table'], depth=1) + if not regions: + self.logger.warning("Page '%s' contains no text/table regions", page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, feature_filter='binarized') + region_image_bin = cv2pil(binarizer.run(image=pil2cv(region_image), use_patches=True)) + # update PAGE (reference the image file): + region_image_ref = AlternativeImageType(comments=region_xywh['features'] + ',binarized') + region.add_AlternativeImage(region_image_ref) + result.images.append(OcrdPageResultImage(region_image_bin, region.id + '.IMG-BIN', region_image_ref)) + + elif oplevel == 'line': + lines = page.get_AllTextLines() + if not lines: + self.logger.warning("Page '%s' contains no text lines", page_id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized') + line_image_bin = cv2pil(binarizer.run(image=pil2cv(line_image), use_patches=True)) + # update PAGE (reference the image file): + line_image_ref = AlternativeImageType(comments=line_xywh['features'] + ',binarized') + line.add_AlternativeImage(region_image_ref) + result.images.append(OcrdPageResultImage(line_image_bin, line.id + '.IMG-BIN', line_image_ref)) + + return result @command() @ocrd_cli_options -def cli(*args, **kwargs): +def main(*args, **kwargs): return ocrd_cli_wrap_processor(SbbBinarizeProcessor, *args, **kwargs)