mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 11:50:00 +02:00
✨ dinglehopper: Add OCR-D parameter to choose TextEquiv level
This commit is contained in:
parent
9744fa2567
commit
b23e4ce30e
4 changed files with 11 additions and 6 deletions
|
@ -148,10 +148,7 @@ class ExtractedText:
|
|||
@property
|
||||
def text(self):
|
||||
if self._text is not None:
|
||||
if self._text == '':
|
||||
return None
|
||||
else:
|
||||
return self._text
|
||||
return self._text
|
||||
else:
|
||||
return self.joiner.join(s.text for s in self.segments)
|
||||
|
||||
|
|
|
@ -82,7 +82,7 @@ def page_extract(tree, *, textequiv_level='region'):
|
|||
regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
|
||||
|
||||
# Filter empty region texts
|
||||
regions = [r for r in regions if r.text is not None]
|
||||
regions = [r for r in regions if r.text != '']
|
||||
|
||||
return ExtractedText(None, regions, '\n', None)
|
||||
|
||||
|
|
|
@ -22,6 +22,12 @@
|
|||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable/disable metrics and green/red"
|
||||
},
|
||||
"textequiv_level": {
|
||||
"type": "string",
|
||||
"enum": ["region", "line"],
|
||||
"default": "region",
|
||||
"description": "PAGE XML hierarchy level to extract the text from"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -32,6 +32,7 @@ class OcrdDinglehopperEvaluate(Processor):
|
|||
log = getLogger('processor.OcrdDinglehopperEvaluate')
|
||||
|
||||
metrics = self.parameter['metrics']
|
||||
textequiv_level = self.parameter['textequiv_level']
|
||||
gt_grp, ocr_grp = self.input_file_grp.split(',')
|
||||
for n, page_id in enumerate(self.workspace.mets.physical_pages):
|
||||
gt_file = next(self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id))
|
||||
|
@ -52,7 +53,8 @@ class OcrdDinglehopperEvaluate(Processor):
|
|||
gt_file.local_filename,
|
||||
ocr_file.local_filename,
|
||||
report_prefix,
|
||||
metrics=metrics
|
||||
metrics=metrics,
|
||||
textequiv_level=textequiv_level
|
||||
)
|
||||
|
||||
# Add reports to the workspace
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue