mirror of
				https://github.com/qurator-spk/eynollah.git
				synced 2025-10-24 23:34:16 +02:00 
			
		
		
		
	
		
			
				
	
	
		
			68 lines
		
	
	
	
		
			2.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			68 lines
		
	
	
	
		
			2.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from json import loads
 | |
| from pkg_resources import resource_string
 | |
| from tempfile import NamedTemporaryFile
 | |
| from pathlib import Path
 | |
| from os.path import join
 | |
| 
 | |
| from PIL import Image
 | |
| 
 | |
| from ocrd import Processor
 | |
| from ocrd_modelfactory import page_from_file, exif_from_filename
 | |
| from ocrd_models import OcrdFile, OcrdExif
 | |
| from ocrd_models.ocrd_page import to_xml
 | |
| from ocrd_utils import (
 | |
|     getLogger,
 | |
|     MIMETYPE_PAGE,
 | |
|     assert_file_grp_cardinality,
 | |
|     make_file_id
 | |
| )
 | |
| 
 | |
| from .eynollah import Eynollah
 | |
| from .utils.pil_cv2 import pil2cv
 | |
| 
 | |
# Parsed contents of the ``ocrd-tool.json`` resource that ships next to this
# module; read once at import time and indexed by the processor below.
OCRD_TOOL = loads(
    resource_string(__name__, 'ocrd-tool.json').decode('utf8')
)
 | |
| 
 | |
class EynollahProcessor(Processor):
    """OCR-D processor that runs Eynollah on every input PAGE file.

    For each file of the (single) input file group, the PAGE document is
    parsed, handed to ``Eynollah`` together with the page's image file, and
    the resulting PAGE-XML is added to the (single) output file group.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the processor with the ``ocrd-eynollah-segment`` tool entry.

        The ``ocrd_tool`` and ``version`` keyword arguments are always taken
        from the bundled ``OCRD_TOOL`` description, overriding any values
        passed by the caller; everything else is forwarded unchanged to
        ``Processor.__init__``.
        """
        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment']
        kwargs['version'] = OCRD_TOOL['version']
        super().__init__(*args, **kwargs)

    def process(self):
        """Run Eynollah on each input file and store the resulting PAGE-XML.

        Requires exactly one input and one output file group. For every
        input file the referenced image is downloaded, ``Eynollah`` is run
        with the processor parameters, and the PAGE document is serialised
        to ``<output_file_grp>/<file_id>.xml`` and registered in the
        workspace.

        Raises:
            StopIteration: if the image referenced by a page's
                ``imageFilename`` is not found in the METS.
        """
        LOG = getLogger('eynollah')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        # Read the input file list once instead of re-reading it on every
        # iteration just to obtain the total for the progress message.
        input_files = self.input_files
        total = len(input_files)

        for n, input_file in enumerate(input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, total)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            LOG.debug('width %s height %s', page.imageWidth, page.imageHeight)
            self.add_metadata(pcgts)

            # XXX loses DPI information
            # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
            # Instead, look up the original image file via its URL in the
            # METS and work on the downloaded local copy.
            # NOTE(review): next() has no default, so a page whose image is
            # not registered in the METS surfaces as a bare StopIteration.
            image_file = next(self.workspace.mets.find_files(url=page.imageFilename))
            image_filename = self.workspace.download_file(image_file).local_filename

            eynollah_kwargs = {
                'dir_models': self.resolve_resource(self.parameter['models']),
                # Enhancement is always disabled when run through this processor.
                'allow_enhancement': False,
                'curved_line': self.parameter['curved_line'],
                'full_layout': self.parameter['full_layout'],
                'allow_scaling': self.parameter['allow_scaling'],
                'headers_off': self.parameter['headers_off'],
                'tables': self.parameter['tables'],
                'override_dpi': self.parameter['dpi'],
                'logger': LOG,
                'pcgts': pcgts,
                'image_filename': image_filename,
            }
            # Presumably Eynollah writes its result into the passed ``pcgts``
            # (it is the object serialised below) -- confirm in Eynollah.run.
            Eynollah(**eynollah_kwargs).run()

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=page_id,
                mimetype=MIMETYPE_PAGE,
                local_filename=join(self.output_file_grp, file_id) + '.xml',
                content=to_xml(pcgts))
 |