🐛 dinglehopper: Skip pages where GT or OCR is missing (Fixes GH-34)

pull/38/head
Gerber, Mike 4 years ago
parent e4e2777cb7
commit 5cbe148741

@ -34,11 +34,16 @@ class OcrdDinglehopperEvaluate(Processor):
metrics = self.parameter['metrics']
textequiv_level = self.parameter['textequiv_level']
gt_grp, ocr_grp = self.input_file_grp.split(',')
for n, page_id in enumerate(self.workspace.mets.physical_pages):
gt_file = next(self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id))
ocr_file = next(self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id))
input_file_tuples = self._zip_input_files([gt_grp, ocr_grp])
for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
if not gt_file or not ocr_file:
# file/page was not found in this group
continue
gt_file = self.workspace.download_file(gt_file)
ocr_file = self.workspace.download_file(ocr_file)
page_id = gt_file.pageId
log.info("INPUT FILES %i / %s%s", n, gt_file, ocr_file)
file_id = make_file_id(ocr_file, self.output_file_grp)
@ -73,6 +78,24 @@ class OcrdDinglehopperEvaluate(Processor):
# Clear cache between files
levenshtein_matrix_cache_clear()
def _zip_input_files(self, input_file_grps):
    """Pair up, page by page, the first file of each input file group.

    The pages considered are ``[self.page_id]`` when ``self.page_id`` is
    truthy, otherwise every entry of ``self.workspace.mets.physical_pages``.
    For each page, the first result of ``find_all_files`` is taken per file
    group; a group without a file for that page contributes ``None`` (and an
    error is logged). Pages that have no file in the *first* group are left
    out of the result entirely.

    Args:
        input_file_grps: Sequence of file group names. The first group
            decides whether a page is kept.

    Returns:
        A list with one tuple per retained page; each tuple holds one entry
        per file group (the file found, or ``None``). The list is empty when
        no file groups are given.
    """
    log = getLogger('processor.OcrdDinglehopperEvaluate')

    if self.page_id:
        page_ids = [self.page_id]
    else:
        page_ids = self.workspace.mets.physical_pages

    input_file_tuples = []
    for page_id in page_ids:
        ifiles = []
        for input_file_grp in input_file_grps:
            log.debug("Adding input file group %s to page %s", input_file_grp, page_id)
            files = self.workspace.mets.find_all_files(pageId=page_id, fileGrp=input_file_grp)
            if files:
                # Only the first match per group is used.
                ifiles.append(files[0])
            else:
                log.error('Found no page "%s" in file group %s', page_id, input_file_grp)
                ifiles.append(None)
        # `ifiles` is empty when no groups were passed; check before indexing
        # so that case yields an empty result instead of an IndexError.
        if ifiles and ifiles[0]:
            input_file_tuples.append(tuple(ifiles))
    return input_file_tuples
# Invoke the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    ocrd_dinglehopper()

Loading…
Cancel
Save