@ -1,11 +1,14 @@
import json
import json
import os
import os
import tempfile
import click
import click
import ocrd_models . ocrd_page
from ocrd import Processor
from ocrd import Processor
from ocrd . decorators import ocrd_cli_options , ocrd_cli_wrap_processor
from ocrd . decorators import ocrd_cli_options , ocrd_cli_wrap_processor
from ocrd_modelfactory import page_from_file
from ocrd_modelfactory import page_from_file
from ocrd_models import OcrdFile
from ocrd_models import OcrdFile
from ocrd_models . ocrd_page_generateds import MetadataItemType , LabelsType , LabelType
from ocrd_utils import concat_padded , getLogger , MIMETYPE_PAGE
from ocrd_utils import concat_padded , getLogger , MIMETYPE_PAGE
from pkg_resources import resource_string
from pkg_resources import resource_string
@ -22,10 +25,14 @@ def ocrd_sbb_textline_detector(*args, **kwargs):
return ocrd_cli_wrap_processor ( OcrdSbbTextlineDetectorRecognize , * args , * * kwargs )
return ocrd_cli_wrap_processor ( OcrdSbbTextlineDetectorRecognize , * args , * * kwargs )
# Name of this processor's entry in the ocrd-tool description. It is used as
# the lookup key into OCRD_TOOL['tools'] and as the value recorded in the
# "processingStep" metadata item, so it must not carry any padding whitespace.
TOOL = 'ocrd_sbb_textline_detector'
class OcrdSbbTextlineDetectorRecognize ( Processor ) :
class OcrdSbbTextlineDetectorRecognize ( Processor ) :
def __init__(self, *args, **kwargs):
    """Initialize the processor with this tool's description and version.

    Sets the ``ocrd_tool`` and ``version`` keyword arguments from the
    module-level ``OCRD_TOOL`` mapping (defined outside this excerpt) and
    forwards all arguments to the base-class initializer. Values the
    caller supplied for those two keys are overwritten.

    Args:
        *args: Positional arguments passed through to ``Processor``.
        **kwargs: Keyword arguments passed through to ``Processor``.
    """
    # Look the tool description up via the shared TOOL constant so the
    # tool name is spelled in exactly one place.
    kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
    kwargs['version'] = OCRD_TOOL['version']
    # Initialize the base class exactly once.
    super(OcrdSbbTextlineDetectorRecognize, self).__init__(*args, **kwargs)
def _make_file_id ( self , input_file , input_file_grp , n ) :
def _make_file_id ( self , input_file , input_file_grp , n ) :
@ -49,7 +56,6 @@ class OcrdSbbTextlineDetectorRecognize(Processor):
log . info ( " INPUT FILE %i / %s " , n , input_file )
log . info ( " INPUT FILE %i / %s " , n , input_file )
file_id = self . _make_file_id ( input_file , self . output_file_grp , n )
file_id = self . _make_file_id ( input_file , self . output_file_grp , n )
image_file = self . _resolve_image_file ( input_file )
# Process the files
# Process the files
try :
try :
@ -57,16 +63,47 @@ class OcrdSbbTextlineDetectorRecognize(Processor):
except FileExistsError :
except FileExistsError :
pass
pass
model = self . parameter [ ' model ' ]
with tempfile . TemporaryDirectory ( ) as tmp_dirname :
x = textlineerkenner ( image_file , self . output_file_grp , file_id , model )
# Segment the image
x . run ( )
image_file = self . _resolve_image_file ( input_file )
model = self . parameter [ ' model ' ]
x = textlineerkenner ( image_file , tmp_dirname , file_id , model )
x . run ( )
# Read segmentation results
tmp_filename = os . path . join ( tmp_dirname , file_id ) + ' .xml '
tmp_pcgts = ocrd_models . ocrd_page . parse ( tmp_filename )
tmp_page = tmp_pcgts . get_Page ( )
# Create a new PAGE file from the input file
pcgts = page_from_file ( self . workspace . download_file ( input_file ) )
page = pcgts . get_Page ( )
# Merge results → PAGE file
page . set_PrintSpace ( tmp_page . get_PrintSpace ( ) )
page . set_ReadingOrder ( tmp_page . get_ReadingOrder ( ) )
page . set_TextRegion ( tmp_page . get_TextRegion ( ) )
# Save metadata about this operation
metadata = pcgts . get_Metadata ( )
metadata . add_MetadataItem (
MetadataItemType ( type_ = " processingStep " ,
name = self . ocrd_tool [ ' steps ' ] [ 0 ] ,
value = TOOL ,
Labels = [ LabelsType (
externalModel = " ocrd-tool " ,
externalId = " parameters " ,
Label = [ LabelType ( type_ = name , value = self . parameter [ name ] )
for name in self . parameter . keys ( ) ] ) ] ) )
self . workspace . add_file (
self . workspace . add_file (
ID = file_id ,
ID = file_id ,
file_grp = self . output_file_grp ,
file_grp = self . output_file_grp ,
pageId = page_id ,
pageId = page_id ,
mimetype = ' application/vnd.prima.page+xml ' ,
mimetype = ' application/vnd.prima.page+xml ' ,
local_filename = os . path . join ( self . output_file_grp , file_id ) + ' .xml ' )
local_filename = os . path . join ( self . output_file_grp , file_id ) + ' .xml ' ,
content = ocrd_models . ocrd_page . to_xml ( pcgts )
)
if __name__ == ' __main__ ' :
if __name__ == ' __main__ ' :