mirror of
				https://github.com/qurator-spk/dinglehopper.git
				synced 2025-10-25 15:34:15 +02:00 
			
		
		
		
	Merge commit '7930ecd428'
				
					
				
			This commit is contained in:
		
						commit
						717801bdbb
					
				
					 9 changed files with 89 additions and 17 deletions
				
			
		
							
								
								
									
										7
									
								
								.gitignore
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										7
									
								
								.gitignore
									
										
									
									
										vendored
									
									
								
							|  | @ -1,3 +1,10 @@ | ||||||
|  | # Byte-compiled / optimized / DLL files | ||||||
|  | __pycache__/ | ||||||
|  | *.py[cod] | ||||||
|  | 
 | ||||||
|  | # Distribution / packaging | ||||||
|  | *.egg-info/ | ||||||
|  | 
 | ||||||
| # User-specific stuff | # User-specific stuff | ||||||
| .idea/**/workspace.xml | .idea/**/workspace.xml | ||||||
| .idea/**/tasks.xml | .idea/**/tasks.xml | ||||||
|  |  | ||||||
							
								
								
									
										5
									
								
								.idea/codeStyles/codeStyleConfig.xml
									
										
									
										generated
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								.idea/codeStyles/codeStyleConfig.xml
									
										
									
										generated
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,5 @@ | ||||||
|  | <component name="ProjectCodeStyleConfiguration"> | ||||||
|  |   <state> | ||||||
|  |     <option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" /> | ||||||
|  |   </state> | ||||||
|  | </component> | ||||||
							
								
								
									
										31
									
								
								README.md
									
										
									
									
									
								
							
							
						
						
									
										31
									
								
								README.md
									
										
									
									
									
								
							|  | @ -1,7 +1,11 @@ | ||||||
| dinglehopper | dinglehopper | ||||||
| ============ | ============ | ||||||
| 
 | 
 | ||||||
| dinglehopper is an OCR evaluation tool and reads [ALTO](https://github.com/altoxml), [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. | dinglehopper is an OCR evaluation tool and reads | ||||||
|  | [ALTO](https://github.com/altoxml), | ||||||
|  | [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files.  It | ||||||
|  | compares a ground truth (GT) document page with an OCR result page to compute | ||||||
|  | metrics and a word/character differences report. | ||||||
| 
 | 
 | ||||||
| [](https://travis-ci.org/qurator-spk/dinglehopper) | [](https://travis-ci.org/qurator-spk/dinglehopper) | ||||||
| 
 | 
 | ||||||
|  | @ -23,6 +27,22 @@ sudo pip install . | ||||||
| Usage | Usage | ||||||
| ----- | ----- | ||||||
| ~~~ | ~~~ | ||||||
|  | Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] | ||||||
|  | 
 | ||||||
|  |   Compare the PAGE/ALTO/text document GT against the document OCR. | ||||||
|  | 
 | ||||||
|  |   The files GT and OCR are usually a ground truth document and the result of | ||||||
|  |   an OCR software, but you may use dinglehopper to compare two OCR results. | ||||||
|  |   In that case, use --no-metrics to disable the then meaningless metrics and | ||||||
|  |   also change the color scheme from green/red to blue. | ||||||
|  | 
 | ||||||
|  | Options: | ||||||
|  |   --metrics / --no-metrics  Enable/disable metrics and green/red | ||||||
|  |   --help                    Show this message and exit. | ||||||
|  | ~~~ | ||||||
|  | 
 | ||||||
|  | For example: | ||||||
|  | ~~~ | ||||||
| dinglehopper some-document.gt.page.xml some-document.ocr.alto.xml | dinglehopper some-document.gt.page.xml some-document.ocr.alto.xml | ||||||
| ~~~ | ~~~ | ||||||
| This generates `report.html` and `report.json`. | This generates `report.html` and `report.json`. | ||||||
|  | @ -30,13 +50,20 @@ This generates `report.html` and `report.json`. | ||||||
| 
 | 
 | ||||||
| As an OCR-D processor: | As an OCR-D processor: | ||||||
| ~~~ | ~~~ | ||||||
| ocrd-dinglehopper -m mets.xml -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL | ocrd-dinglehopper -I OCR-D-GT-PAGE,OCR-D-OCR-TESS -O OCR-D-OCR-TESS-EVAL | ||||||
| ~~~ | ~~~ | ||||||
| This generates HTML and JSON reports in the `OCR-D-OCR-TESS-EVAL` filegroup. | This generates HTML and JSON reports in the `OCR-D-OCR-TESS-EVAL` filegroup. | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
| 
 | 
 | ||||||
|  | You may also want to disable metrics and the green-red color scheme by | ||||||
|  | parameter: | ||||||
|  | 
 | ||||||
|  | ~~~ | ||||||
|  | ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -p '{"metrics": false}' | ||||||
|  | ~~~ | ||||||
|  | 
 | ||||||
| Testing | Testing | ||||||
| ------- | ------- | ||||||
| Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests): | Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests): | ||||||
|  |  | ||||||
|  | @ -44,7 +44,7 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align): | ||||||
|         '''.format(gtx, ocrx) |         '''.format(gtx, ocrx) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def process(gt, ocr, report_prefix): | def process(gt, ocr, report_prefix, *, metrics=True): | ||||||
|     """Check OCR result against GT. |     """Check OCR result against GT. | ||||||
| 
 | 
 | ||||||
|     The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use |     The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use | ||||||
|  | @ -91,7 +91,8 @@ def process(gt, ocr, report_prefix): | ||||||
|             cer=cer, n_characters=n_characters, |             cer=cer, n_characters=n_characters, | ||||||
|             wer=wer, n_words=n_words, |             wer=wer, n_words=n_words, | ||||||
|             char_diff_report=char_diff_report, |             char_diff_report=char_diff_report, | ||||||
|             word_diff_report=word_diff_report |             word_diff_report=word_diff_report, | ||||||
|  |             metrics=metrics, | ||||||
|         ).dump(out_fn) |         ).dump(out_fn) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -99,8 +100,17 @@ def process(gt, ocr, report_prefix): | ||||||
| @click.argument('gt', type=click.Path(exists=True)) | @click.argument('gt', type=click.Path(exists=True)) | ||||||
| @click.argument('ocr', type=click.Path(exists=True)) | @click.argument('ocr', type=click.Path(exists=True)) | ||||||
| @click.argument('report_prefix', type=click.Path(), default='report') | @click.argument('report_prefix', type=click.Path(), default='report') | ||||||
| def main(gt, ocr, report_prefix): | @click.option('--metrics/--no-metrics', default=True, help='Enable/disable metrics and green/red') | ||||||
|     process(gt, ocr, report_prefix) | def main(gt, ocr, report_prefix, metrics): | ||||||
|  |     """ | ||||||
|  |     Compare the PAGE/ALTO/text document GT against the document OCR. | ||||||
|  | 
 | ||||||
|  |     The files GT and OCR are usually a ground truth document and the result of | ||||||
|  |     an OCR software, but you may use dinglehopper to compare two OCR results. In | ||||||
|  |     that case, use --no-metrics to disable the then meaningless metrics and also | ||||||
|  |     change the color scheme from green/red to blue. | ||||||
|  |     """ | ||||||
|  |     process(gt, ocr, report_prefix, metrics=metrics) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||||
|  |  | ||||||
|  | @ -16,7 +16,14 @@ | ||||||
|       ], |       ], | ||||||
|       "steps": [ |       "steps": [ | ||||||
|         "recognition/text-recognition" |         "recognition/text-recognition" | ||||||
|       ] |       ], | ||||||
|  |       "parameters": { | ||||||
|  |         "metrics": { | ||||||
|  |           "type": "boolean", | ||||||
|  |           "default": true, | ||||||
|  |           "description": "Enable/disable metrics and green/red" | ||||||
|  |         } | ||||||
|  |       } | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -4,7 +4,7 @@ import os | ||||||
| import click | import click | ||||||
| from ocrd import Processor | from ocrd import Processor | ||||||
| from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor | from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor | ||||||
| from ocrd_utils import concat_padded, getLogger | from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality | ||||||
| from pkg_resources import resource_string | from pkg_resources import resource_string | ||||||
| 
 | 
 | ||||||
| from qurator.dinglehopper.cli import process as cli_process | from qurator.dinglehopper.cli import process as cli_process | ||||||
|  | @ -27,20 +27,20 @@ class OcrdDinglehopperEvaluate(Processor): | ||||||
|         kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper'] |         kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-dinglehopper'] | ||||||
|         super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) |         super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) | ||||||
| 
 | 
 | ||||||
|     def _make_file_id(self, input_file, input_file_grp, n): |  | ||||||
|         file_id = input_file.ID.replace(input_file_grp, self.output_file_grp) |  | ||||||
|         if file_id == input_file.ID: |  | ||||||
|             file_id = concat_padded(self.output_file_grp, n) |  | ||||||
|         return file_id |  | ||||||
| 
 |  | ||||||
|     def process(self): |     def process(self): | ||||||
|  |         assert_file_grp_cardinality(self.input_file_grp, 2, 'GT and OCR') | ||||||
|  |         assert_file_grp_cardinality(self.output_file_grp, 1) | ||||||
|  | 
 | ||||||
|  |         metrics = self.parameter['metrics'] | ||||||
|         gt_grp, ocr_grp = self.input_file_grp.split(',') |         gt_grp, ocr_grp = self.input_file_grp.split(',') | ||||||
|         for n, page_id in enumerate(self.workspace.mets.physical_pages): |         for n, page_id in enumerate(self.workspace.mets.physical_pages): | ||||||
|             gt_file = self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)[0] |             gt_file = self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)[0] | ||||||
|             ocr_file = self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id)[0] |             ocr_file = self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id)[0] | ||||||
|  |             gt_file = self.workspace.download_file(gt_file) | ||||||
|  |             ocr_file = self.workspace.download_file(ocr_file) | ||||||
|             log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file) |             log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file) | ||||||
| 
 | 
 | ||||||
|             file_id = self._make_file_id(ocr_file, ocr_grp, n) |             file_id = make_file_id(ocr_file, self.output_file_grp) | ||||||
|             report_prefix = os.path.join(self.output_file_grp, file_id) |             report_prefix = os.path.join(self.output_file_grp, file_id) | ||||||
| 
 | 
 | ||||||
|             # Process the files |             # Process the files | ||||||
|  | @ -48,7 +48,12 @@ class OcrdDinglehopperEvaluate(Processor): | ||||||
|                 os.mkdir(self.output_file_grp) |                 os.mkdir(self.output_file_grp) | ||||||
|             except FileExistsError: |             except FileExistsError: | ||||||
|                 pass |                 pass | ||||||
|             cli_process(gt_file.local_filename, ocr_file.local_filename, report_prefix) |             cli_process( | ||||||
|  |                     gt_file.local_filename, | ||||||
|  |                     ocr_file.local_filename, | ||||||
|  |                     report_prefix, | ||||||
|  |                     metrics=metrics | ||||||
|  |             ) | ||||||
| 
 | 
 | ||||||
|             # Add reports to the workspace |             # Add reports to the workspace | ||||||
|             for report_suffix, mimetype in \ |             for report_suffix, mimetype in \ | ||||||
|  |  | ||||||
|  | @ -6,12 +6,18 @@ | ||||||
| 
 | 
 | ||||||
|     <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous"> |     <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous"> | ||||||
|     <style type="text/css"> |     <style type="text/css"> | ||||||
|  |     {% if metrics %} | ||||||
|     .gt .diff { |     .gt .diff { | ||||||
|         color: green; |         color: green; | ||||||
|     } |     } | ||||||
|     .ocr .diff { |     .ocr .diff { | ||||||
|         color: red; |         color: red; | ||||||
|     } |     } | ||||||
|  |     {% else %} | ||||||
|  |     .gt .diff, .ocr .diff { | ||||||
|  |         color: blue; | ||||||
|  |     } | ||||||
|  |     {% endif %} | ||||||
|     .ellipsis { |     .ellipsis { | ||||||
|         opacity: 0.5; |         opacity: 0.5; | ||||||
|         font-style: italic; |         font-style: italic; | ||||||
|  | @ -32,9 +38,11 @@ | ||||||
| {{ ocr }} | {{ ocr }} | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | {% if metrics %} | ||||||
| <h2>Metrics</h2> | <h2>Metrics</h2> | ||||||
| <p>CER: {{ cer|round(4) }}</p> | <p>CER: {{ cer|round(4) }}</p> | ||||||
| <p>WER: {{ wer|round(4) }}</p> | <p>WER: {{ wer|round(4) }}</p> | ||||||
|  | {% endif %} | ||||||
| 
 | 
 | ||||||
| <h2>Character differences</h2> | <h2>Character differences</h2> | ||||||
| {{ char_diff_report }} | {{ char_diff_report }} | ||||||
|  |  | ||||||
|  | @ -4,4 +4,5 @@ lxml | ||||||
| uniseg | uniseg | ||||||
| numpy | numpy | ||||||
| colorama | colorama | ||||||
| ocrd >= 1.0.0b15 | MarkupSafe | ||||||
|  | ocrd >= 2.13.1 | ||||||
|  |  | ||||||
							
								
								
									
										2
									
								
								setup.cfg
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								setup.cfg
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,2 @@ | ||||||
|  | [flake8] | ||||||
|  | max-line-length = 90 | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue