mirror of
				https://github.com/qurator-spk/dinglehopper.git
				synced 2025-10-25 15:34:15 +02:00 
			
		
		
		
	Allow disabling the html report.
This commit is contained in:
		
							parent
							
								
									e8ccffb275
								
							
						
					
					
						commit
						7642a53091
					
				
					 6 changed files with 60 additions and 14 deletions
				
			
		|  | @ -46,12 +46,15 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] | |||
|   The metrics can be chosen via a comma separated combination of their acronyms | ||||
|   like "--metrics=ca,wer,boc,bow". | ||||
| 
 | ||||
|   The html report can be enabled/disabled using --html/--no-html. | ||||
| 
 | ||||
|   By default, the text of PAGE files is extracted on 'region' level. You may | ||||
|   use "--textequiv-level line" to extract from the level of TextLine tags. | ||||
| 
 | ||||
| Options: | ||||
|   --metrics                 Enable different metrics like ca|cer, wa|wer, boc and bow. | ||||
|   --textequiv-level LEVEL   PAGE TextEquiv level to extract text from | ||||
|   --html / --no-html        Enable/disable html report. | ||||
|   --progress                Show progress bar | ||||
|   --help                    Show this message and exit. | ||||
| ~~~ | ||||
|  | @ -85,10 +88,11 @@ The OCR-D processor has these parameters: | |||
| | ------------------------- | ------------------------------------------------------------------- | | ||||
| | `-P metrics cer,wer`      | Enable character error rate and word error rate (default)           | | ||||
| | `-P textequiv_level line` | (PAGE) Extract text from TextLine level (default: TextRegion level) | | ||||
| | `-P html false`           | Disable the html report (default: enabled).                         | | ||||
| 
 | ||||
| For example: | ||||
| ~~~ | ||||
| ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer | ||||
| ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer -P html false | ||||
| ~~~ | ||||
| 
 | ||||
| Developer information | ||||
|  |  | |||
|  | @ -144,7 +144,9 @@ def generate_json_report(gt, ocr, report_prefix, metrics_results): | |||
|         json.dump(json_dict, fp) | ||||
| 
 | ||||
| 
 | ||||
| def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="region"): | ||||
| def process( | ||||
|     gt, ocr, report_prefix, *, html=True, metrics="cer,wer", textequiv_level="region" | ||||
| ): | ||||
|     """Check OCR result against GT. | ||||
| 
 | ||||
|     The @click decorators change the signature of the decorated functions, | ||||
|  | @ -172,6 +174,7 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio | |||
|             metrics_results[result.metric] = result | ||||
| 
 | ||||
|     generate_json_report(gt, ocr, report_prefix, metrics_results) | ||||
|     if html: | ||||
|         generate_html_report(gt, ocr, gt_text, ocr_text, report_prefix, metrics_results) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -179,6 +182,7 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio | |||
| @click.argument("gt", type=click.Path(exists=True)) | ||||
| @click.argument("ocr", type=click.Path(exists=True)) | ||||
| @click.argument("report_prefix", type=click.Path(), default="report") | ||||
| @click.option("--html", default=True, is_flag=True, help="Enable/disable html report.") | ||||
| @click.option( | ||||
|     "--metrics", | ||||
|     default="cer,wer", | ||||
|  | @ -191,7 +195,7 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio | |||
|     metavar="LEVEL", | ||||
| ) | ||||
| @click.option("--progress", default=False, is_flag=True, help="Show progress bar") | ||||
| def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): | ||||
| def main(gt, ocr, report_prefix, html, metrics, textequiv_level, progress): | ||||
|     """ | ||||
|     Compare the PAGE/ALTO/text document GT against the document OCR. | ||||
| 
 | ||||
|  | @ -210,12 +214,21 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): | |||
|     The metrics can be chosen via a comma separated combination of their acronyms | ||||
|     like "--metrics=ca,wer,boc,bow". | ||||
| 
 | ||||
|     The html report can be enabled/disabled using --html / --no-html. | ||||
| 
 | ||||
|     By default, the text of PAGE files is extracted on 'region' level. You may | ||||
|     use "--textequiv-level line" to extract from the level of TextLine tags. | ||||
|     """ | ||||
|     initLogging() | ||||
|     Config.progress = progress | ||||
|     process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level) | ||||
|     process( | ||||
|         gt, | ||||
|         ocr, | ||||
|         report_prefix, | ||||
|         html=html, | ||||
|         metrics=metrics, | ||||
|         textequiv_level=textequiv_level, | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|  |  | |||
|  | @ -18,6 +18,11 @@ | |||
|         "recognition/text-recognition" | ||||
|       ], | ||||
|       "parameters": { | ||||
|         "html": { | ||||
|           "type": "boolean", | ||||
|           "default": true, | ||||
|           "description": "Enable/disable html report." | ||||
|         }, | ||||
|         "metrics": { | ||||
|           "type": "string", | ||||
|           "enum": ["", "boc", "boc,bow", "bow", "ca", "ca,boc", "ca,boc,bow", "ca,bow", "ca,wa", "ca,wa,boc", "ca,wa,boc,bow", "ca,wa,bow", "ca,wer", "ca,wer,boc", "ca,wer,boc,bow", "ca,wer,bow", "cer", "cer,boc", "cer,boc,bow", "cer,bow", "cer,wa", "cer,wa,boc", "cer,wa,boc,bow", "cer,wa,bow", "cer,wer", "cer,wer,boc", "cer,wer,boc,bow", "cer,wer,bow", "wa", "wa,boc", "wa,boc,bow", "wa,bow", "wer", "wer,boc", "wer,boc,bow", "wer,bow"], | ||||
|  | @ -28,7 +33,7 @@ | |||
|           "type": "string", | ||||
|           "enum": ["region", "line"], | ||||
|           "default": "region", | ||||
|           "description": "PAGE XML hierarchy level to extract the text from" | ||||
|           "description": "PAGE XML hierarchy level to extract the text from." | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|  |  | |||
|  | @ -30,6 +30,7 @@ class OcrdDinglehopperEvaluate(Processor): | |||
| 
 | ||||
|         log = getLogger("processor.OcrdDinglehopperEvaluate") | ||||
| 
 | ||||
|         html = self.parameter["html"] | ||||
|         metrics = self.parameter["metrics"] | ||||
|         textequiv_level = self.parameter["textequiv_level"] | ||||
|         gt_grp, ocr_grp = self.input_file_grp.split(",") | ||||
|  | @ -57,15 +58,16 @@ class OcrdDinglehopperEvaluate(Processor): | |||
|                 gt_file.local_filename, | ||||
|                 ocr_file.local_filename, | ||||
|                 report_prefix, | ||||
|                 html=html, | ||||
|                 metrics=metrics, | ||||
|                 textequiv_level=textequiv_level, | ||||
|             ) | ||||
| 
 | ||||
|             # Add reports to the workspace | ||||
|             for report_suffix, mimetype in [ | ||||
|                 [".html", "text/html"], | ||||
|                 [".json", "application/json"], | ||||
|             ]: | ||||
|             report_types = [(".json", "application/json")] | ||||
|             if html: | ||||
|                 report_types.append((".html", "text/html")) | ||||
|             for report_suffix, mimetype in report_types: | ||||
|                 self.workspace.add_file( | ||||
|                     ID=file_id + report_suffix, | ||||
|                     file_grp=self.output_file_grp, | ||||
|  |  | |||
|  | @ -1,3 +1,5 @@ | |||
| from pathlib import Path | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| from .util import working_directory | ||||
|  | @ -16,3 +18,19 @@ def test_cli_unknown_metric(tmp_path): | |||
| 
 | ||||
|         with pytest.raises(ValueError, match="Unknown metric 'unknown'."): | ||||
|             process("gt.txt", "ocr.txt", "report", metrics="cer,unknown") | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.integration | ||||
| def test_cli_html_report_parameter(tmp_path): | ||||
|     """Test that html report can get turned off.""" | ||||
| 
 | ||||
|     with working_directory(str(tmp_path)): | ||||
|         with open("gt.txt", "w") as gtf: | ||||
|             gtf.write("") | ||||
|         with open("ocr.txt", "w") as ocrf: | ||||
|             ocrf.write("") | ||||
| 
 | ||||
|         process("gt.txt", "ocr.txt", "report", html=False) | ||||
| 
 | ||||
|         assert Path("report").with_suffix(".json").exists() | ||||
|         assert not Path("report").with_suffix(".html").exists() | ||||
|  |  | |||
|  | @ -1,5 +1,6 @@ | |||
| import json | ||||
| from itertools import combinations | ||||
| from pathlib import Path | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
|  | @ -45,11 +46,10 @@ def test_cli_json(metrics, tmp_path): | |||
|         with open("ocr.txt", "w") as ocrf: | ||||
|             ocrf.write("AAAAB") | ||||
| 
 | ||||
|         with open("gt.txt", "r") as gtf: | ||||
|             print(gtf.read()) | ||||
|         process("gt.txt", "ocr.txt", "report", metrics=",".join(metrics)) | ||||
|         with open("report.json", "r") as jsonf: | ||||
|             print(jsonf.read()) | ||||
| 
 | ||||
|         for suffix in (".json", ".html"): | ||||
|             assert Path("report").with_suffix(suffix).exists() | ||||
| 
 | ||||
|         with open("report.json", "r") as jsonf: | ||||
|             j = json.load(jsonf) | ||||
|  | @ -76,6 +76,10 @@ def test_cli_json_extremes(gt, ocr, err, tmp_path): | |||
|             ocrf.write(ocr) | ||||
| 
 | ||||
|         process("gt.txt", "ocr.txt", "report", metrics="ca,wa,boc,bow") | ||||
| 
 | ||||
|         for suffix in (".json", ".html"): | ||||
|             assert Path("report").with_suffix(suffix).exists() | ||||
| 
 | ||||
|         with open("report.json", "r") as jsonf: | ||||
|             j = json.load(jsonf) | ||||
|             for metric in set(METRIC_DICT.values()): | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue