From a1c8f6f4650591eda9cda8af5cb17237b6b2deb0 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 16 Oct 2020 11:53:26 +0200 Subject: [PATCH] line-level binarization independent of region-level --- sbb_binarize/ocrd_cli.py | 53 +++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/sbb_binarize/ocrd_cli.py b/sbb_binarize/ocrd_cli.py index d755b53..d846212 100644 --- a/sbb_binarize/ocrd_cli.py +++ b/sbb_binarize/ocrd_cli.py @@ -79,38 +79,35 @@ class SbbBinarizeProcessor(Processor): file_grp=self.output_file_grp) page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comment='%s,binarized' % page_xywh['features'])) - else: - regions = page.get_AllRegions(['Text', 'Table']) + elif oplevel == 'region': + regions = page.get_AllRegions(['Text', 'Table'], depth=1) if not regions: LOG.warning("Page '%s' contains no text/table regions", page_id) - for region in regions: region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh, feature_filter='binarized') - - if oplevel == 'region': - region_image_bin = self._run_binarizer(region_image) - region_image_bin_path = self.workspace.save_image_file( - region_image_bin, - "%s_%s.IMG-BIN" % (file_id, region.id), - page_id=input_file.pageId, - file_grp=self.output_file_grp) - region.add_AlternativeImage( - AlternativeImageType(filename=region_image_bin_path, comments='%s,binarized' % region_xywh['features'])) - - elif oplevel == 'line': - lines = region.get_TextLine() - if not lines: - LOG.warning("Page '%s' region '%s' contains no text lines", page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized') - line_image_bin = self._run_binarizer(line_image) - line_image_bin_path = self.workspace.save_image_file( - line_image_bin, - "%s_%s_%s.IMG-BIN" % (file_id, region.id, line.id), - page_id=input_file.pageId, - file_grp=self.output_file_grp) - line.add_AlternativeImage( - AlternativeImageType(filename=line_image_bin_path, comments='%s,binarized' % line_xywh['features'])) + region_image_bin = self._run_binarizer(region_image) + region_image_bin_path = self.workspace.save_image_file( + region_image_bin, + "%s_%s.IMG-BIN" % (file_id, region.id), + page_id=input_file.pageId, + file_grp=self.output_file_grp) + region.add_AlternativeImage( + AlternativeImageType(filename=region_image_bin_path, comments='%s,binarized' % region_xywh['features'])) + + elif oplevel == 'line': + region_line_tuples = [(r.id, r.get_TextLine()) for r in page.get_AllRegions(['Text'], depth=0)] + if not region_line_tuples: + LOG.warning("Page '%s' contains no text lines", page_id) + for region_id, line in region_line_tuples: + line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized') + line_image_bin = self._run_binarizer(line_image) + line_image_bin_path = self.workspace.save_image_file( + line_image_bin, + "%s_%s_%s.IMG-BIN" % (file_id, region_id, line.id), + page_id=input_file.pageId, + file_grp=self.output_file_grp) + line.add_AlternativeImage( + AlternativeImageType(filename=line_image_bin_path, comments='%s,binarized' % line_xywh['features'])) self.workspace.add_file( ID=file_id,