OCR-D: move sbb-binarize to ocrd-tool.json, update to v3

pull/148/head
Robert Sachunsky 2 weeks ago
parent c794d4d29f
commit a1068ff2eb

@ -32,7 +32,7 @@ plotting = ["matplotlib"]
[project.scripts] [project.scripts]
eynollah = "eynollah.cli:main" eynollah = "eynollah.cli:main"
ocrd-eynollah-segment = "eynollah.ocrd_cli:main" ocrd-eynollah-segment = "eynollah.ocrd_cli:main"
ocrd-sbb-binarize = "eynollah.ocrd_cli_binarization:cli" ocrd-sbb-binarize = "eynollah.ocrd_cli_binarization:main"
[project.urls] [project.urls]
Homepage = "https://github.com/qurator-spk/eynollah" Homepage = "https://github.com/qurator-spk/eynollah"

@ -91,6 +91,47 @@
"path_in_archive": "models_eynollah" "path_in_archive": "models_eynollah"
} }
] ]
},
"ocrd-sbb-binarize": {
"executable": "ocrd-sbb-binarize",
"description": "Pixelwise binarization with selectional auto-encoders in Keras",
"categories": ["Image preprocessing"],
"steps": ["preprocessing/optimization/binarization"],
"input_file_grp_cardinality": 1,
"output_file_grp_cardinality": 1,
"parameters": {
"operation_level": {
"type": "string",
"enum": ["page", "region"],
"default": "page",
"description": "PAGE XML hierarchy level to operate on"
},
"model": {
"description": "Directory containing HDF5 or SavedModel/ProtoBuf models. Can be an absolute path or a path relative to the OCR-D resource location or the current working directory",
"type": "string",
"format": "uri",
"content-type": "text/directory",
"required": true
}
},
"resources": [
{
"url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip",
"name": "default",
"type": "archive",
"path_in_archive": "saved_model_2020_01_16",
"size": 563147331,
"description": "default models provided by github.com/qurator-spk (SavedModel format)"
},
{
"url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip",
"name": "default-2021-03-09",
"type": "archive",
"path_in_archive": ".",
"size": 133230419,
"description": "updated default models provided by github.com/qurator-spk (SavedModel format)"
}
]
} }
} }
} }

@ -1,29 +1,16 @@
from os import environ from typing import Optional
from os.path import join
from pathlib import Path
from pkg_resources import resource_string
from json import loads
from PIL import Image from PIL import Image
import numpy as np import numpy as np
import cv2 import cv2
from click import command from click import command
from ocrd_utils import ( from ocrd import Processor, OcrdPageResult, OcrdPageResultImage
getLogger, from ocrd_models.ocrd_page import OcrdPage, AlternativeImageType
assert_file_grp_cardinality,
make_file_id,
MIMETYPE_PAGE
)
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import AlternativeImageType, to_xml
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from .sbb_binarize import SbbBinarizer from .sbb_binarize import SbbBinarizer
OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool-binarization.json').decode('utf8'))
TOOL = 'ocrd-sbb-binarize'
def cv2pil(img):
    """
    Convert an image array (as handled by OpenCV) into a PIL image.

    The array is cast to ``uint8`` before being handed to PIL.
    """
    as_uint8 = img.astype('uint8')
    return Image.fromarray(as_uint8)
@ -35,39 +22,22 @@ def pil2cv(img):
return cv2.cvtColor(pil_as_np_array, color_conversion) return cv2.cvtColor(pil_as_np_array, color_conversion)
class SbbBinarizeProcessor(Processor):
    """
    OCR-D processor binarizing page, region or line images with ``SbbBinarizer``.
    """
    # already employs GPU (without singleton process atm)
    max_workers = 1

    @property
    def executable(self):
        """Name of the executable this processor is published as."""
        return 'ocrd-sbb-binarize'

    def setup(self):
        """
        Set up the model prior to processing.
        """
        # resolve relative path via OCR-D ResourceManager
        model_path = self.resolve_resource(self.parameter['model'])
        self.binarizer = SbbBinarizer(model_dir=model_path, logger=self.logger)

    def _binarize(self, image):
        """
        Run the binarizer created in ``setup`` on a PIL image (with patches
        enabled) and return the result as a PIL image again.
        """
        return cv2pil(self.binarizer.run(image=pil2cv(image), use_patches=True))

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
        """
        Binarize images with sbb_binarization (based on selectional auto-encoders).

        Take the first input PAGE document and retrieve its page image
        (requested with ``feature_filter='binarized'``). Then, depending on the
        ``operation_level`` parameter, binarize either the page image as a whole,
        or the image of each text/table region, or the image of each text line.

        Reference each new image as AlternativeImage of the respective element
        (with ``binarized`` appended to its feature list), and add it to the
        result under a file ID suffix derived from the element.

        Return the modified PAGE hierarchy along with the new images; writing
        them out is not done here.

        Args:
            input_pcgts: parsed PAGE documents of the current page, one per
                input file group; only the first one is used.
            page_id: identifier of the current page (used for image retrieval
                and log messages).

        Returns:
            The ``OcrdPageResult`` wrapping the first input document, with one
            ``OcrdPageResultImage`` per binarized page/region/line.
        """
        # these asserts mainly narrow the Optional types for the code below
        assert input_pcgts
        assert input_pcgts[0]
        assert self.parameter
        oplevel = self.parameter['operation_level']
        pcgts = input_pcgts[0]
        result = OcrdPageResult(pcgts)
        page = pcgts.get_Page()
        page_image, page_xywh, _ = self.workspace.image_from_page(
            page, page_id, feature_filter='binarized')

        if oplevel == 'page':
            self.logger.info("Binarizing on 'page' level in page '%s'", page_id)
            page_image_bin = self._binarize(page_image)
            # update PAGE (reference the image file):
            page_image_ref = AlternativeImageType(comments=page_xywh['features'] + ',binarized,clipped')
            page.add_AlternativeImage(page_image_ref)
            result.images.append(OcrdPageResultImage(page_image_bin, '.IMG-BIN', page_image_ref))

        elif oplevel == 'region':
            regions = page.get_AllRegions(['Text', 'Table'], depth=1)
            if not regions:
                self.logger.warning("Page '%s' contains no text/table regions", page_id)
            for region in regions:
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh, feature_filter='binarized')
                # was a bare ``binarizer`` (undefined name): use the instance's model
                region_image_bin = self._binarize(region_image)
                # update PAGE (reference the image file):
                region_image_ref = AlternativeImageType(comments=region_xywh['features'] + ',binarized')
                region.add_AlternativeImage(region_image_ref)
                result.images.append(OcrdPageResultImage(region_image_bin, region.id + '.IMG-BIN', region_image_ref))

        elif oplevel == 'line':
            # NOTE(review): the tool description's enum for operation_level only
            # lists "page" and "region" - confirm whether 'line' should be selectable
            lines = page.get_AllTextLines()
            if not lines:
                self.logger.warning("Page '%s' contains no text lines", page_id)
            for line in lines:
                line_image, line_xywh = self.workspace.image_from_segment(
                    line, page_image, page_xywh, feature_filter='binarized')
                # was a bare ``binarizer`` (undefined name): use the instance's model
                line_image_bin = self._binarize(line_image)
                # update PAGE (reference the image file):
                line_image_ref = AlternativeImageType(comments=line_xywh['features'] + ',binarized')
                # attach the line's own reference (previously the region's was used)
                line.add_AlternativeImage(line_image_ref)
                result.images.append(OcrdPageResultImage(line_image_bin, line.id + '.IMG-BIN', line_image_ref))

        return result
@command()
@ocrd_cli_options
def main(*args, **kwargs):
    # Command-line entry point: forwards all parsed CLI options unchanged
    # to the OCR-D wrapper, which drives SbbBinarizeProcessor.
    # (Kept as comments rather than a docstring so the click command's
    # help text is not affected.)
    processor_class = SbbBinarizeProcessor
    return ocrd_cli_wrap_processor(processor_class, *args, **kwargs)

Loading…
Cancel
Save