def _zip_input_files(self, input_file_grps):
    """Collect, page by page, one input file per requested file group.

    Method of ``OcrdDinglehopperEvaluate`` (qurator/dinglehopper/ocrd_cli.py).

    For every page to process, the METS is queried once per entry of
    *input_file_grps* and the first matching file is taken.  A group that
    has no file for the page contributes ``None`` (and an error is logged),
    so callers must check each tuple entry before using it; ``process``
    skips a tuple when either of its files is falsy.

    Pages for which the *first* file group has no file are left out of the
    result entirely.

    Args:
        input_file_grps: Sequence of file group names, in the order the
            files should appear in each resulting tuple.

    Returns:
        A list of tuples, one per retained page, each holding one file (or
        ``None``) per entry of *input_file_grps*.
    """
    log = getLogger('processor.OcrdDinglehopperEvaluate')

    if self.page_id:
        # NOTE(review): OCR-D passes the page selection as one
        # comma-separated string; split it so that each requested page gets
        # its own tuple instead of all of them being looked up as a single
        # page ID. Confirm against the ocrd core version in use.
        requested = self.page_id
        if isinstance(requested, str):
            requested = requested.split(',')
        page_ids = [p.strip() for p in requested if p and p.strip()]
    else:
        page_ids = self.workspace.mets.physical_pages

    input_file_tuples = []
    for page_id in page_ids:
        ifiles = []
        for input_file_grp in input_file_grps:
            log.debug("Adding input file group %s to page %s", input_file_grp, page_id)
            files = self.workspace.mets.find_all_files(pageId=page_id, fileGrp=input_file_grp)
            if not files:
                log.error('Found no page "%s" in file group %s', page_id, input_file_grp)
                ifiles.append(None)
            else:
                # Only the first match is used, even if the group holds
                # several files for this page.
                ifiles.append(files[0])
        # Guard against an empty group list before looking at the first
        # entry; a page without a file in the first group is dropped.
        if ifiles and ifiles[0]:
            input_file_tuples.append(tuple(ifiles))
    return input_file_tuples