mirror of
				https://github.com/qurator-spk/dinglehopper.git
				synced 2025-10-29 00:14:17 +01:00 
			
		
		
		
	✨ dinglehopper: Add OCR-D parameter to choose TextEquiv level
This commit is contained in:
		
							parent
							
								
									9744fa2567
								
							
						
					
					
						commit
						b23e4ce30e
					
				
					 4 changed files with 11 additions and 6 deletions
				
			
		|  | @ -148,9 +148,6 @@ class ExtractedText: | |||
|     @property | ||||
|     def text(self): | ||||
|         if self._text is not None: | ||||
|-            if self._text == '': | ||||
|-                return None | ||||
|-            else: | ||||
|+            return self._text | ||||
|         else: | ||||
|             return self.joiner.join(s.text for s in self.segments) | ||||
|  |  | |||
|  | @ -82,7 +82,7 @@ def page_extract(tree, *, textequiv_level='region'): | |||
|             regions.append(ExtractedText.from_text_segment(region, nsmap, textequiv_level=textequiv_level)) | ||||
| 
 | ||||
|     # Filter empty region texts | ||||
|-    regions = [r for r in regions if r.text is not None] | ||||
|+    regions = [r for r in regions if r.text != ''] | ||||
| 
 | ||||
|     return ExtractedText(None, regions, '\n', None) | ||||
| 
 | ||||
|  |  | |||
|  | @ -22,6 +22,12 @@ | |||
|           "type": "boolean", | ||||
|           "default": true, | ||||
|           "description": "Enable/disable metrics and green/red" | ||||
|         }, | ||||
|         "textequiv_level": { | ||||
|           "type": "string", | ||||
|           "enum": ["region", "line"], | ||||
|           "default": "region", | ||||
|           "description": "PAGE XML hierarchy level to extract the text from" | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|  |  | |||
|  | @ -32,6 +32,7 @@ class OcrdDinglehopperEvaluate(Processor): | |||
|         log = getLogger('processor.OcrdDinglehopperEvaluate') | ||||
| 
 | ||||
|         metrics = self.parameter['metrics'] | ||||
|+        textequiv_level = self.parameter['textequiv_level'] | ||||
|         gt_grp, ocr_grp = self.input_file_grp.split(',') | ||||
|         for n, page_id in enumerate(self.workspace.mets.physical_pages): | ||||
|             gt_file = next(self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)) | ||||
|  | @ -52,7 +53,8 @@ class OcrdDinglehopperEvaluate(Processor): | |||
|                     gt_file.local_filename, | ||||
|                     ocr_file.local_filename, | ||||
|                     report_prefix, | ||||
|-                    metrics=metrics | ||||
|+                    metrics=metrics, | ||||
|+                    textequiv_level=textequiv_level | ||||
|             ) | ||||
| 
 | ||||
|             # Add reports to the workspace | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue