mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-01 06:29:59 +02:00
✨ dinglehopper: Add OCR-D parameter to choose TextEquiv level
This commit is contained in:
parent
9744fa2567
commit
b23e4ce30e
4 changed files with 11 additions and 6 deletions
|
@@ -148,10 +148,7 @@ class ExtractedText:
|
||||||
@property
|
@property
|
||||||
def text(self):
|
def text(self):
|
||||||
if self._text is not None:
|
if self._text is not None:
|
||||||
if self._text == '':
|
return self._text
|
||||||
return None
|
|
||||||
else:
|
|
||||||
return self._text
|
|
||||||
else:
|
else:
|
||||||
return self.joiner.join(s.text for s in self.segments)
|
return self.joiner.join(s.text for s in self.segments)
|
||||||
|
|
||||||
|
|
|
@@ -82,7 +82,7 @@ def page_extract(tree, *, textequiv_level='region'):
|
||||||
regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
|
regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level))
|
||||||
|
|
||||||
# Filter empty region texts
|
# Filter empty region texts
|
||||||
regions = [r for r in regions if r.text is not None]
|
regions = [r for r in regions if r.text != '']
|
||||||
|
|
||||||
return ExtractedText(None, regions, '\n', None)
|
return ExtractedText(None, regions, '\n', None)
|
||||||
|
|
||||||
|
|
|
@@ -22,6 +22,12 @@
|
||||||
"type": "boolean",
|
"type": "boolean",
|
||||||
"default": true,
|
"default": true,
|
||||||
"description": "Enable/disable metrics and green/red"
|
"description": "Enable/disable metrics and green/red"
|
||||||
|
},
|
||||||
|
"textequiv_level": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["region", "line"],
|
||||||
|
"default": "region",
|
||||||
|
"description": "PAGE XML hierarchy level to extract the text from"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@@ -32,6 +32,7 @@ class OcrdDinglehopperEvaluate(Processor):
|
||||||
log = getLogger('processor.OcrdDinglehopperEvaluate')
|
log = getLogger('processor.OcrdDinglehopperEvaluate')
|
||||||
|
|
||||||
metrics = self.parameter['metrics']
|
metrics = self.parameter['metrics']
|
||||||
|
textequiv_level = self.parameter['textequiv_level']
|
||||||
gt_grp, ocr_grp = self.input_file_grp.split(',')
|
gt_grp, ocr_grp = self.input_file_grp.split(',')
|
||||||
for n, page_id in enumerate(self.workspace.mets.physical_pages):
|
for n, page_id in enumerate(self.workspace.mets.physical_pages):
|
||||||
gt_file = next(self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id))
|
gt_file = next(self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id))
|
||||||
|
@@ -52,7 +53,8 @@ class OcrdDinglehopperEvaluate(Processor):
|
||||||
gt_file.local_filename,
|
gt_file.local_filename,
|
||||||
ocr_file.local_filename,
|
ocr_file.local_filename,
|
||||||
report_prefix,
|
report_prefix,
|
||||||
metrics=metrics
|
metrics=metrics,
|
||||||
|
textequiv_level=textequiv_level
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add reports to the workspace
|
# Add reports to the workspace
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue