diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index f99a9b1..56af085 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -148,10 +148,7 @@ class ExtractedText: @property def text(self): if self._text is not None: - if self._text == '': - return None - else: - return self._text + return self._text else: return self.joiner.join(s.text for s in self.segments) diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 3f73406..9a9f058 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -82,7 +82,7 @@ def page_extract(tree, *, textequiv_level='region'): regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level)) # Filter empty region texts - regions = [r for r in regions if r.text is not None] + regions = [r for r in regions if r.text != ''] return ExtractedText(None, regions, '\n', None) diff --git a/qurator/dinglehopper/ocrd-tool.json b/qurator/dinglehopper/ocrd-tool.json index 1f009ae..1e2b9b0 100644 --- a/qurator/dinglehopper/ocrd-tool.json +++ b/qurator/dinglehopper/ocrd-tool.json @@ -22,6 +22,12 @@ "type": "boolean", "default": true, "description": "Enable/disable metrics and green/red" + }, + "textequiv_level": { + "type": "string", + "enum": ["region", "line"], + "default": "region", + "description": "PAGE XML hierarchy level to extract the text from" } } } diff --git a/qurator/dinglehopper/ocrd_cli.py b/qurator/dinglehopper/ocrd_cli.py index d1e127b..89f04d9 100644 --- a/qurator/dinglehopper/ocrd_cli.py +++ b/qurator/dinglehopper/ocrd_cli.py @@ -32,6 +32,7 @@ class OcrdDinglehopperEvaluate(Processor): log = getLogger('processor.OcrdDinglehopperEvaluate') metrics = self.parameter['metrics'] + textequiv_level = self.parameter['textequiv_level'] gt_grp, ocr_grp = self.input_file_grp.split(',') for n, page_id in enumerate(self.workspace.mets.physical_pages): gt_file = next(self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)) @@ -52,7 +53,8 @@ class OcrdDinglehopperEvaluate(Processor): gt_file.local_filename, ocr_file.local_filename, report_prefix, - metrics=metrics + metrics=metrics, + textequiv_level=textequiv_level ) # Add reports to the workspace