dinglehopper: Add OCR-D parameter to choose TextEquiv level

pull/38/head
Gerber, Mike 4 years ago
parent 9744fa2567
commit b23e4ce30e

@ -148,10 +148,7 @@ class ExtractedText:
@property
def text(self):
if self._text is not None:
if self._text == '':
return None
else:
return self._text
return self._text
else:
return self.joiner.join(s.text for s in self.segments)

@ -82,7 +82,7 @@ def page_extract(tree, *, textequiv_level='region'):
regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
# Filter empty region texts
regions = [r for r in regions if r.text is not None]
regions = [r for r in regions if r.text != '']
return ExtractedText(None, regions, '\n', None)

@ -22,6 +22,12 @@
"type": "boolean",
"default": true,
"description": "Enable/disable metrics and green/red"
},
"textequiv_level": {
"type": "string",
"enum": ["region", "line"],
"default": "region",
"description": "PAGE XML hierarchy level to extract the text from"
}
}
}

@ -32,6 +32,7 @@ class OcrdDinglehopperEvaluate(Processor):
log = getLogger('processor.OcrdDinglehopperEvaluate')
metrics = self.parameter['metrics']
textequiv_level = self.parameter['textequiv_level']
gt_grp, ocr_grp = self.input_file_grp.split(',')
for n, page_id in enumerate(self.workspace.mets.physical_pages):
gt_file = next(self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id))
@ -52,7 +53,8 @@ class OcrdDinglehopperEvaluate(Processor):
gt_file.local_filename,
ocr_file.local_filename,
report_prefix,
metrics=metrics
metrics=metrics,
textequiv_level=textequiv_level
)
# Add reports to the workspace

Loading…
Cancel
Save