mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 11:50:00 +02:00
🐛 dinglehopper: Skip pages if there is no GT nor OCR (Fixes GH-34)
This commit is contained in:
parent
e4e2777cb7
commit
5cbe148741
1 changed file with 26 additions and 3 deletions
|
@ -34,11 +34,16 @@ class OcrdDinglehopperEvaluate(Processor):
|
|||
metrics = self.parameter['metrics']
|
||||
textequiv_level = self.parameter['textequiv_level']
|
||||
gt_grp, ocr_grp = self.input_file_grp.split(',')
|
||||
for n, page_id in enumerate(self.workspace.mets.physical_pages):
|
||||
gt_file = next(self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id))
|
||||
ocr_file = next(self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id))
|
||||
|
||||
input_file_tuples = self._zip_input_files([gt_grp, ocr_grp])
|
||||
for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
|
||||
if not gt_file or not ocr_file:
|
||||
# file/page was not found in this group
|
||||
continue
|
||||
gt_file = self.workspace.download_file(gt_file)
|
||||
ocr_file = self.workspace.download_file(ocr_file)
|
||||
page_id = gt_file.pageId
|
||||
|
||||
log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
|
||||
|
||||
file_id = make_file_id(ocr_file, self.output_file_grp)
|
||||
|
@ -73,6 +78,24 @@ class OcrdDinglehopperEvaluate(Processor):
|
|||
# Clear cache between files
|
||||
levenshtein_matrix_cache_clear()
|
||||
|
||||
def _zip_input_files(self, input_file_grps):
    """Pair up, page by page, the first file of each input file group.

    The pages considered are ``[self.page_id]`` when ``self.page_id`` is
    truthy, otherwise ``self.workspace.mets.physical_pages``.  For each page,
    the first result of ``self.workspace.mets.find_all_files`` is taken for
    every group in ``input_file_grps`` (in the given order).  A group that
    yields no file for the page contributes ``None`` and an error is logged.

    A page is only included in the result when its entry for the *first*
    group is truthy; entries for later groups may still be ``None``, so
    callers have to check them.

    Args:
        input_file_grps: Sequence of file group names to look up per page.

    Returns:
        A list of tuples, one per retained page, each holding one entry
        (file or ``None``) per input file group.  Empty when
        ``input_file_grps`` is empty.
    """
    log = getLogger('processor.OcrdDinglehopperEvaluate')

    if self.page_id:
        page_ids = [self.page_id]
    else:
        page_ids = self.workspace.mets.physical_pages

    input_file_tuples = []
    for page_id in page_ids:
        ifiles = []
        for input_file_grp in input_file_grps:
            log.debug("Adding input file group %s to page %s", input_file_grp, page_id)
            files = self.workspace.mets.find_all_files(pageId=page_id, fileGrp=input_file_grp)
            if not files:
                log.error('Found no page "%s" in file group %s', page_id, input_file_grp)
                ifiles.append(None)
            else:
                ifiles.append(files[0])
        # `ifiles` is empty when no groups were requested; guard the index
        # access so that case yields an empty result instead of IndexError.
        if ifiles and ifiles[0]:
            input_file_tuples.append(tuple(ifiles))
    return input_file_tuples
|
||||
|
||||
|
||||
# Call ocrd_dinglehopper() only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    ocrd_dinglehopper()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue