From ee26ebd7d8a4bf359f303c49fe68a69b91a26721 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 15 Oct 2020 16:55:17 +0200 Subject: [PATCH] implement region/line binarization --- sbb_binarize/ocrd-tool.json | 2 +- sbb_binarize/ocrd_cli.py | 66 +++++++++++++++++++++++++++++-------- 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/sbb_binarize/ocrd-tool.json b/sbb_binarize/ocrd-tool.json index dafc09a..e0c4795 100644 --- a/sbb_binarize/ocrd-tool.json +++ b/sbb_binarize/ocrd-tool.json @@ -14,7 +14,7 @@ "type": "string", "enum": ["page", "region", "line"], "default": "page", - "description": "PAGE XML hierarchy level to operate on (currently only page supported" + "description": "PAGE XML hierarchy level to operate on" }, "patches": { "description": "by setting this parameter to true you let the model to see the image in patches.", diff --git a/sbb_binarize/ocrd_cli.py b/sbb_binarize/ocrd_cli.py index 7cb8022..187269f 100644 --- a/sbb_binarize/ocrd_cli.py +++ b/sbb_binarize/ocrd_cli.py @@ -1,3 +1,5 @@ +# TODO: AlternativeImage 'binarized' comment should be additive + import os.path from pkg_resources import resource_string from json import loads @@ -9,13 +11,7 @@ from ocrd_utils import ( MIMETYPE_PAGE ) from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - MetadataItemType, - LabelsType, LabelType, - AlternativeImageType, - TextRegionType, - to_xml -) +from ocrd_models.ocrd_page import AlternativeImageType, to_xml from ocrd import Processor from .sbb_binarize import SbbBinarizer @@ -48,24 +44,68 @@ class SbbBinarizeProcessor(Processor): LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) self.add_metadata(pcgts) + pcgts.set_pcGtsId(file_id) page = pcgts.get_Page() if oplevel == 'page': LOG.info("Binarizing on 'page' level in page '%s'", page_id) page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id) - binarizer = SbbBinarizer(image=page_image, model=model_path, patches=use_patches, save=None) - bin_image = binarizer.run() + bin_image = SbbBinarizer( + image=page_image, + model=model_path, + patches=use_patches, + save=None + ).run() # update METS (add the image file): bin_image_path = self.workspace.save_image_file(bin_image, file_id + '.IMG-BIN', page_id=page_id, file_grp=self.output_file_grp) - page.add_AlternativeImage(filename=bin_image_path, comment="binarized") + page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comment="binarized")) + else: - raise NotImplementedError("Binarization below page level not implemented yet") + regions = page.get_AllRegions(['Text', 'Table']) + if not regions: + LOG.warning("Page '%s' contains no text/table regions", page_id) + + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) + + if oplevel == 'region': + region_image_bin = SbbBinarizer( + image=region_image, + model=model_path, + patches=use_patches, + save=None + ).run() + region_image_bin_path = self.workspace.save_image_file( + region_image_bin, + "%s_%s.IMG-BIN" % (file_id, region.id), + page_id=page_id, + file_grp=self.output_file_grp) + region.add_AlternativeImage( + AlternativeImageType(filename=region_image_bin_path, comments='binarized')) + + elif oplevel == 'line': + lines = region.get_TextLine() + if not lines: + LOG.warning("Page '%s' region '%s' contains no text lines", page_id, region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh) + line_image_bin = SbbBinarizer( + image=line_image, + model=model_path, + patches=use_patches, + save=None + ).run() + line_image_bin_path = self.workspace.save_image_file( + line_image_bin, + "%s_%s_%s.IMG-BIN" % (file_id, region.id, line.id), + page_id=page_id, + file_grp=self.output_file_grp) + line.add_AlternativeImage( + AlternativeImageType(filename=line_image_bin_path, comments='binarized')) - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) self.workspace.add_file( ID=file_id, file_grp=self.output_file_grp,