From 4aed06a325bf7d172612198ae1b5fa00ea723b0d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 19 Nov 2019 15:08:53 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20sbb=5Ftextline=5Fdetection:=20Prese?= =?UTF-8?q?rve=20input=20PAGE=20info=20by=20merging=20segmentation=20resul?= =?UTF-8?q?ts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ocrd_sbb_textline_detection used the output XML by main.py as is, and – by doing this – threw away any input data from the input PAGE, including the critical pc:AlternativeImage and the less important pc:MetadataItem. Fix this by merging the segmentation results into a file created from the input file. Also add a pc:MetadataItem processingStep about the segmentation operation. --- qurator/sbb_textline_detector/ocrd_cli.py | 49 ++++++++++++++++++++--- requirements.txt | 3 +- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/qurator/sbb_textline_detector/ocrd_cli.py b/qurator/sbb_textline_detector/ocrd_cli.py index 2a98104..728fafb 100644 --- a/qurator/sbb_textline_detector/ocrd_cli.py +++ b/qurator/sbb_textline_detector/ocrd_cli.py @@ -1,11 +1,14 @@ import json import os +import tempfile import click +import ocrd_models.ocrd_page from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_modelfactory import page_from_file from ocrd_models import OcrdFile +from ocrd_models.ocrd_page_generateds import MetadataItemType, LabelsType, LabelType from ocrd_utils import concat_padded, getLogger, MIMETYPE_PAGE from pkg_resources import resource_string @@ -22,10 +25,14 @@ def ocrd_sbb_textline_detector(*args, **kwargs): return ocrd_cli_wrap_processor(OcrdSbbTextlineDetectorRecognize, *args, **kwargs) +TOOL = 'ocrd_sbb_textline_detector' + + class OcrdSbbTextlineDetectorRecognize(Processor): def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd_sbb_textline_detector'] + kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] + kwargs['version'] = OCRD_TOOL['version'] super(OcrdSbbTextlineDetectorRecognize, self).__init__(*args, **kwargs) def _make_file_id(self, input_file, input_file_grp, n): @@ -49,7 +56,6 @@ class OcrdSbbTextlineDetectorRecognize(Processor): log.info("INPUT FILE %i / %s", n, input_file) file_id = self._make_file_id(input_file, self.output_file_grp, n) - image_file = self._resolve_image_file(input_file) # Process the files try: @@ -57,16 +63,47 @@ class OcrdSbbTextlineDetectorRecognize(Processor): except FileExistsError: pass - model = self.parameter['model'] - x = textlineerkenner(image_file, self.output_file_grp, file_id, model) - x.run() + with tempfile.TemporaryDirectory() as tmp_dirname: + # Segment the image + image_file = self._resolve_image_file(input_file) + model = self.parameter['model'] + x = textlineerkenner(image_file, tmp_dirname, file_id, model) + x.run() + + # Read segmentation results + tmp_filename = os.path.join(tmp_dirname, file_id) + '.xml' + tmp_pcgts = ocrd_models.ocrd_page.parse(tmp_filename) + tmp_page = tmp_pcgts.get_Page() + + # Create a new PAGE file from the input file + pcgts = page_from_file(self.workspace.download_file(input_file)) + page = pcgts.get_Page() + + # Merge results → PAGE file + page.set_PrintSpace(tmp_page.get_PrintSpace()) + page.set_ReadingOrder(tmp_page.get_ReadingOrder()) + page.set_TextRegion(tmp_page.get_TextRegion()) + + # Save metadata about this operation + metadata = pcgts.get_Metadata() + metadata.add_MetadataItem( + MetadataItemType(type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=TOOL, + Labels=[LabelsType( + externalModel="ocrd-tool", + externalId="parameters", + Label=[LabelType(type_=name, value=self.parameter[name]) + for name in self.parameter.keys()])])) self.workspace.add_file( ID=file_id, file_grp=self.output_file_grp, pageId=page_id, mimetype='application/vnd.prima.page+xml', - local_filename=os.path.join(self.output_file_grp, file_id) + '.xml') + local_filename=os.path.join(self.output_file_grp, file_id) + '.xml', + content=ocrd_models.ocrd_page.to_xml(pcgts) + ) if __name__ == '__main__': diff --git a/requirements.txt b/requirements.txt index 3bcc5bc..42de57a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,5 +9,4 @@ scikit-learn tensorflow-gpu < 2.0 scipy click -ocrd >= 1.0.0b19 - +ocrd >= 2.0.0