|
|
@ -34,11 +34,16 @@ class OcrdDinglehopperEvaluate(Processor):
|
|
|
|
metrics = self.parameter['metrics']
|
|
|
|
metrics = self.parameter['metrics']
|
|
|
|
textequiv_level = self.parameter['textequiv_level']
|
|
|
|
textequiv_level = self.parameter['textequiv_level']
|
|
|
|
gt_grp, ocr_grp = self.input_file_grp.split(',')
|
|
|
|
gt_grp, ocr_grp = self.input_file_grp.split(',')
|
|
|
|
for n, page_id in enumerate(self.workspace.mets.physical_pages):
|
|
|
|
|
|
|
|
gt_file = next(self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id))
|
|
|
|
input_file_tuples = self._zip_input_files([gt_grp, ocr_grp])
|
|
|
|
ocr_file = next(self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id))
|
|
|
|
for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
|
|
|
|
|
|
|
|
if not gt_file or not ocr_file:
|
|
|
|
|
|
|
|
# file/page was not found in this group
|
|
|
|
|
|
|
|
continue
|
|
|
|
gt_file = self.workspace.download_file(gt_file)
|
|
|
|
gt_file = self.workspace.download_file(gt_file)
|
|
|
|
ocr_file = self.workspace.download_file(ocr_file)
|
|
|
|
ocr_file = self.workspace.download_file(ocr_file)
|
|
|
|
|
|
|
|
page_id = gt_file.pageId
|
|
|
|
|
|
|
|
|
|
|
|
log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
|
|
|
|
log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
|
|
|
|
|
|
|
|
|
|
|
|
file_id = make_file_id(ocr_file, self.output_file_grp)
|
|
|
|
file_id = make_file_id(ocr_file, self.output_file_grp)
|
|
|
@ -73,6 +78,24 @@ class OcrdDinglehopperEvaluate(Processor):
|
|
|
|
# Clear cache between files
|
|
|
|
# Clear cache between files
|
|
|
|
levenshtein_matrix_cache_clear()
|
|
|
|
levenshtein_matrix_cache_clear()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _zip_input_files(self, input_file_grps):
    """Pair up, page by page, the first file of each input file group.

    Pages are taken from ``self.page_id`` when it is set, otherwise from
    ``self.workspace.mets.physical_pages``.

    Args:
        input_file_grps: Sequence of file group names. The first group is
            the leading one: a page without a file in that group is left
            out of the result entirely.

    Returns:
        A list with one tuple per retained page. Each tuple holds one entry
        per group, in ``input_file_grps`` order: the first file found for
        the page in that group, or ``None`` if the group has no file for it.
        An empty ``input_file_grps`` yields an empty list.
    """
    log = getLogger('processor.OcrdDinglehopperEvaluate')
    # Only touch the METS page list when no explicit page was requested.
    page_ids = ([self.page_id] if self.page_id else
                self.workspace.mets.physical_pages)
    input_file_tuples = []
    for page_id in page_ids:
        ifiles = []
        for input_file_grp in input_file_grps:
            log.debug("Adding input file group %s to page %s", input_file_grp, page_id)
            files = self.workspace.mets.find_all_files(pageId=page_id, fileGrp=input_file_grp)
            if not files:
                log.error('Found no page "%s" in file group %s', page_id, input_file_grp)
                # Keep a placeholder so tuple positions still line up with the groups.
                ifiles.append(None)
            else:
                # Several files may match; only the first one is used.
                ifiles.append(files[0])
        # Guard against an empty group list (would otherwise raise IndexError),
        # and drop pages that have no file in the leading group.
        if ifiles and ifiles[0]:
            input_file_tuples.append(tuple(ifiles))
    return input_file_tuples
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: call ocrd_dinglehopper (defined outside this view)
# exactly once when the module is executed directly.
if __name__ == '__main__':
    ocrd_dinglehopper()
|
|
|
|