mirror of
https://github.com/qurator-spk/sbb_textline_detection.git
synced 2025-06-09 03:40:18 +02:00
✨ sbb_textline_detection: Preserve input PAGE info by merging segmentation results
ocrd_sbb_textline_detection used the output XML produced by main.py as is and, by doing so, threw away all data from the input PAGE file, including the critical pc:AlternativeImage and the less important pc:MetadataItem. Fix this by merging the segmentation results into a file created from the input file. Also add a pc:MetadataItem of type processingStep that records the segmentation operation.
This commit is contained in:
parent
4fb3e70ef6
commit
4aed06a325
2 changed files with 44 additions and 8 deletions
|
@ -1,11 +1,14 @@
|
|||
import json
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import click
|
||||
import ocrd_models.ocrd_page
|
||||
from ocrd import Processor
|
||||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
||||
from ocrd_modelfactory import page_from_file
|
||||
from ocrd_models import OcrdFile
|
||||
from ocrd_models.ocrd_page_generateds import MetadataItemType, LabelsType, LabelType
|
||||
from ocrd_utils import concat_padded, getLogger, MIMETYPE_PAGE
|
||||
from pkg_resources import resource_string
|
||||
|
||||
|
@ -22,10 +25,14 @@ def ocrd_sbb_textline_detector(*args, **kwargs):
|
|||
return ocrd_cli_wrap_processor(OcrdSbbTextlineDetectorRecognize, *args, **kwargs)
|
||||
|
||||
|
||||
TOOL = 'ocrd_sbb_textline_detector'
|
||||
|
||||
|
||||
class OcrdSbbTextlineDetectorRecognize(Processor):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd_sbb_textline_detector']
|
||||
kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
|
||||
kwargs['version'] = OCRD_TOOL['version']
|
||||
super(OcrdSbbTextlineDetectorRecognize, self).__init__(*args, **kwargs)
|
||||
|
||||
def _make_file_id(self, input_file, input_file_grp, n):
|
||||
|
@ -49,7 +56,6 @@ class OcrdSbbTextlineDetectorRecognize(Processor):
|
|||
log.info("INPUT FILE %i / %s", n, input_file)
|
||||
|
||||
file_id = self._make_file_id(input_file, self.output_file_grp, n)
|
||||
image_file = self._resolve_image_file(input_file)
|
||||
|
||||
# Process the files
|
||||
try:
|
||||
|
@ -57,16 +63,47 @@ class OcrdSbbTextlineDetectorRecognize(Processor):
|
|||
except FileExistsError:
|
||||
pass
|
||||
|
||||
model = self.parameter['model']
|
||||
x = textlineerkenner(image_file, self.output_file_grp, file_id, model)
|
||||
x.run()
|
||||
with tempfile.TemporaryDirectory() as tmp_dirname:
|
||||
# Segment the image
|
||||
image_file = self._resolve_image_file(input_file)
|
||||
model = self.parameter['model']
|
||||
x = textlineerkenner(image_file, tmp_dirname, file_id, model)
|
||||
x.run()
|
||||
|
||||
# Read segmentation results
|
||||
tmp_filename = os.path.join(tmp_dirname, file_id) + '.xml'
|
||||
tmp_pcgts = ocrd_models.ocrd_page.parse(tmp_filename)
|
||||
tmp_page = tmp_pcgts.get_Page()
|
||||
|
||||
# Create a new PAGE file from the input file
|
||||
pcgts = page_from_file(self.workspace.download_file(input_file))
|
||||
page = pcgts.get_Page()
|
||||
|
||||
# Merge results → PAGE file
|
||||
page.set_PrintSpace(tmp_page.get_PrintSpace())
|
||||
page.set_ReadingOrder(tmp_page.get_ReadingOrder())
|
||||
page.set_TextRegion(tmp_page.get_TextRegion())
|
||||
|
||||
# Save metadata about this operation
|
||||
metadata = pcgts.get_Metadata()
|
||||
metadata.add_MetadataItem(
|
||||
MetadataItemType(type_="processingStep",
|
||||
name=self.ocrd_tool['steps'][0],
|
||||
value=TOOL,
|
||||
Labels=[LabelsType(
|
||||
externalModel="ocrd-tool",
|
||||
externalId="parameters",
|
||||
Label=[LabelType(type_=name, value=self.parameter[name])
|
||||
for name in self.parameter.keys()])]))
|
||||
|
||||
self.workspace.add_file(
|
||||
ID=file_id,
|
||||
file_grp=self.output_file_grp,
|
||||
pageId=page_id,
|
||||
mimetype='application/vnd.prima.page+xml',
|
||||
local_filename=os.path.join(self.output_file_grp, file_id) + '.xml')
|
||||
local_filename=os.path.join(self.output_file_grp, file_id) + '.xml',
|
||||
content=ocrd_models.ocrd_page.to_xml(pcgts)
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue