From c0aa82d18885402ddc0093dfc75a07e0c23a0e5b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 16 Apr 2025 14:00:05 +0200 Subject: [PATCH] OCR-D processor: properly handle missing or non-downloaded GT/OCR file Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/dinglehopper/ocrd_cli.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 9696ff9..52da817 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -27,14 +27,19 @@ class OcrdDinglehopperEvaluate(Processor): metrics = self.parameter["metrics"] textequiv_level = self.parameter["textequiv_level"] - try: - gt_file, ocr_file = input_files - assert gt_file, 'missing GT file' - assert ocr_file, 'missing OCR file' - assert gt_file.local_filename - assert ocr_file.local_filename - except (ValueError, AssertionError) as err: - self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page? + # wrong number of inputs: let fail + gt_file, ocr_file = input_files + # missing on either side: skip (zip_input_files already warned) + if not gt_file or not ocr_file: + return + # missing download (i.e. OCRD_DOWNLOAD_INPUT=false): + if not gt_file.local_filename: + if config.OCRD_MISSING_INPUT == 'ABORT': + raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype) + return + if not ocr_file.local_filename: + if config.OCRD_MISSING_INPUT == 'ABORT': + raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype) return page_id = gt_file.pageId