OCR-D processor: properly handle missing or non-downloaded GT/OCR file

Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
pull/128/head
Konstantin Baierer 4 days ago committed by Mike Gerber
parent 8c1b6d65f5
commit c0aa82d188

@@ -27,14 +27,19 @@ class OcrdDinglehopperEvaluate(Processor):
         metrics = self.parameter["metrics"]
         textequiv_level = self.parameter["textequiv_level"]
-        try:
-            gt_file, ocr_file = input_files
-            assert gt_file, 'missing GT file'
-            assert ocr_file, 'missing OCR file'
-            assert gt_file.local_filename
-            assert ocr_file.local_filename
-        except (ValueError, AssertionError) as err:
-            self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page?
+        # wrong number of inputs: let fail
+        gt_file, ocr_file = input_files
+        # missing on either side: skip (zip_input_files already warned)
+        if not gt_file or not ocr_file:
+            return
+        # missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
+        if not gt_file.local_filename:
+            if config.OCRD_MISSING_INPUT == 'ABORT':
+                raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype)
+            return
+        if not ocr_file.local_filename:
+            if config.OCRD_MISSING_INPUT == 'ABORT':
+                raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype)
             return
         page_id = gt_file.pageId

Loading…
Cancel
Save