1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-06-07 19:05:13 +02:00

OCR-D processor: properly handle missing or non-downloaded GT/OCR file

Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
This commit is contained in:
Konstantin Baierer 2025-04-16 14:00:05 +02:00 committed by Mike Gerber
parent 8c1b6d65f5
commit c0aa82d188

View file

@@ -27,14 +27,19 @@ class OcrdDinglehopperEvaluate(Processor):
metrics = self.parameter["metrics"]
textequiv_level = self.parameter["textequiv_level"]
try:
gt_file, ocr_file = input_files
assert gt_file, 'missing GT file'
assert ocr_file, 'missing OCR file'
assert gt_file.local_filename
assert ocr_file.local_filename
except (ValueError, AssertionError) as err:
self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page?
# wrong number of inputs: let fail
gt_file, ocr_file = input_files
# missing on either side: skip (zip_input_files already warned)
if not gt_file or not ocr_file:
return
# missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
if not gt_file.local_filename:
if config.OCRD_MISSING_INPUT == 'ABORT':
raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype)
return
if not ocr_file.local_filename:
if config.OCRD_MISSING_INPUT == 'ABORT':
raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype)
return
page_id = gt_file.pageId