mirror of
				https://github.com/qurator-spk/dinglehopper.git
				synced 2025-10-22 22:14:19 +02:00 
			
		
		
		
	Port to OCR-D/core API v3
This commit is contained in:
		
							parent
							
								
									bf6633be02
								
							
						
					
					
						commit
						63031b30bf
					
				
					 5 changed files with 84 additions and 73 deletions
				
			
		
							
								
								
									
										5
									
								
								.dockerignore
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								.dockerignore
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,5 @@ | ||||||
|  | src/dinglehopper/tests | ||||||
|  | dist | ||||||
|  | build | ||||||
|  | *.egg-info | ||||||
|  | .git | ||||||
							
								
								
									
										33
									
								
								Dockerfile
									
										
									
									
									
								
							
							
						
						
									
										33
									
								
								Dockerfile
									
										
									
									
									
								
							|  | @ -6,17 +6,30 @@ LABEL \ | ||||||
|     maintainer="https://github.com/qurator-spk/dinglehopper/issues" \ |     maintainer="https://github.com/qurator-spk/dinglehopper/issues" \ | ||||||
|     org.label-schema.vcs-ref=$VCS_REF \ |     org.label-schema.vcs-ref=$VCS_REF \ | ||||||
|     org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ |     org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ | ||||||
|     org.label-schema.build-date=$BUILD_DATE |     org.label-schema.build-date=$BUILD_DATE \ | ||||||
|  |     org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ | ||||||
|  |     org.opencontainers.image.title="dinglehopper" \ | ||||||
|  |     org.opencontainers.image.description="The OCR evaluation tool" \ | ||||||
|  |     org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \ | ||||||
|  |     org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \ | ||||||
|  |     org.opencontainers.image.revision=$VCS_REF \ | ||||||
|  |     org.opencontainers.image.created=$BUILD_DATE \ | ||||||
|  |     org.opencontainers.image.base.name=ocrd/core | ||||||
|  | 
 | ||||||
|  | ENV LANG=C.UTF-8 | ||||||
|  | ENV LC_ALL=C.UTF-8 | ||||||
|  | 
 | ||||||
|  | # avoid HOME/.local/share (hard to predict USER here) | ||||||
|  | # so let XDG_DATA_HOME coincide with fixed system location | ||||||
|  | # (can still be overridden by derived stages) | ||||||
|  | ENV XDG_DATA_HOME /usr/local/share | ||||||
|  | # avoid the need for an extra volume for persistent resource user db | ||||||
|  | # (i.e. XDG_CONFIG_HOME/ocrd/resources.yml) | ||||||
|  | ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources | ||||||
| 
 | 
 | ||||||
| WORKDIR /build/dinglehopper | WORKDIR /build/dinglehopper | ||||||
| COPY pyproject.toml . | COPY . . | ||||||
| COPY src/dinglehopper/ocrd-tool.json . | RUN make install && rm -rf /build/dinglehopper | ||||||
| COPY src ./src |  | ||||||
| COPY requirements.txt . |  | ||||||
| COPY README.md . |  | ||||||
| COPY Makefile . |  | ||||||
| RUN make install |  | ||||||
| RUN rm -rf /build/dinglehopper |  | ||||||
| 
 | 
 | ||||||
| WORKDIR /data | WORKDIR /data | ||||||
| VOLUME ["/data"] | VOLUME /data | ||||||
|  |  | ||||||
							
								
								
									
										9
									
								
								Makefile
									
										
									
									
									
								
							
							
						
						
									
										9
									
								
								Makefile
									
										
									
									
									
								
							|  | @ -1,8 +1,9 @@ | ||||||
| PYTHON = python3 | PYTHON = python3 | ||||||
| PIP = pip3 | PIP = pip3 | ||||||
| PYTHONIOENCODING=utf8 | PYTHONIOENCODING=utf8 | ||||||
|  | PYTEST_ARGS = -vv | ||||||
| 
 | 
 | ||||||
| DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0 | DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0 | ||||||
| DOCKER_TAG = ocrd/dinglehopper | DOCKER_TAG = ocrd/dinglehopper | ||||||
| 
 | 
 | ||||||
| help: | help: | ||||||
|  | @ -16,6 +17,12 @@ help: | ||||||
| install: | install: | ||||||
| 	$(PIP) install . | 	$(PIP) install . | ||||||
| 
 | 
 | ||||||
|  | install-dev: | ||||||
|  | 	$(PIP) install -e . | ||||||
|  | 
 | ||||||
|  | test: | ||||||
|  | 	pytest $(PYTEST_ARGS) | ||||||
|  | 
 | ||||||
| docker: | docker: | ||||||
| 	docker build \
 | 	docker build \
 | ||||||
| 	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
 | 	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
 | ||||||
|  |  | ||||||
|  | @ -1,17 +1,13 @@ | ||||||
| { | { | ||||||
|   "version": "0.9.7", |   "version": "0.9.7", | ||||||
|   "git_url": "https://github.com/qurator-spk/dinglehopper", |   "git_url": "https://github.com/qurator-spk/dinglehopper", | ||||||
|  |   "dockerhub": "ocrd/dinglehopper", | ||||||
|   "tools": { |   "tools": { | ||||||
|     "ocrd-dinglehopper": { |     "ocrd-dinglehopper": { | ||||||
|       "executable": "ocrd-dinglehopper", |       "executable": "ocrd-dinglehopper", | ||||||
|  |       "input_file_grp_cardinality": 2, | ||||||
|  |       "output_file_grp_cardinality": 1, | ||||||
|       "description": "Evaluate OCR text against ground truth with dinglehopper", |       "description": "Evaluate OCR text against ground truth with dinglehopper", | ||||||
|       "input_file_grp": [ |  | ||||||
|         "OCR-D-GT-PAGE", |  | ||||||
|         "OCR-D-OCR" |  | ||||||
|       ], |  | ||||||
|       "output_file_grp": [ |  | ||||||
|         "OCR-D-OCR-EVAL" |  | ||||||
|       ], |  | ||||||
|       "categories": [ |       "categories": [ | ||||||
|         "Quality assurance" |         "Quality assurance" | ||||||
|       ], |       ], | ||||||
|  |  | ||||||
|  | @ -1,83 +1,73 @@ | ||||||
| import json | from functools import cached_property | ||||||
| import os | import os | ||||||
|  | from typing import Optional | ||||||
| 
 | 
 | ||||||
| import click | import click | ||||||
| import importlib_resources | from ocrd_models import OcrdFileType | ||||||
| from ocrd import Processor | from ocrd import Processor | ||||||
| from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor | from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor | ||||||
| from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id | from ocrd_utils import make_file_id | ||||||
| 
 | 
 | ||||||
| from .cli import process as cli_process | from .cli import process as cli_process | ||||||
| 
 | 
 | ||||||
| OCRD_TOOL = json.loads( |  | ||||||
|     importlib_resources.files(__name__) |  | ||||||
|     .joinpath("ocrd-tool.json") |  | ||||||
|     .read_text(encoding="utf-8", errors="strict") |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| @click.command() | @click.command() | ||||||
| @ocrd_cli_options | @ocrd_cli_options | ||||||
| def ocrd_dinglehopper(*args, **kwargs): | def ocrd_dinglehopper(*args, **kwargs): | ||||||
|     return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) |     return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| class OcrdDinglehopperEvaluate(Processor): | class OcrdDinglehopperEvaluate(Processor): | ||||||
|     def __init__(self, *args, **kwargs): |  | ||||||
|         kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"] |  | ||||||
|         kwargs["version"] = OCRD_TOOL["version"] |  | ||||||
|         super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) |  | ||||||
| 
 | 
 | ||||||
|     def process(self): |     @cached_property | ||||||
|         assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR") |     def executable(self): | ||||||
|         assert_file_grp_cardinality(self.output_file_grp, 1) |         return 'ocrd-dinglehopper' | ||||||
| 
 | 
 | ||||||
|         log = getLogger("processor.OcrdDinglehopperEvaluate") |     def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: | ||||||
| 
 | 
 | ||||||
|  |         assert self.parameter | ||||||
|         metrics = self.parameter["metrics"] |         metrics = self.parameter["metrics"] | ||||||
|         textequiv_level = self.parameter["textequiv_level"] |         textequiv_level = self.parameter["textequiv_level"] | ||||||
|         gt_grp, ocr_grp = self.input_file_grp.split(",") |  | ||||||
| 
 | 
 | ||||||
|         input_file_tuples = self.zip_input_files(on_error="abort") |         try: | ||||||
|         for n, (gt_file, ocr_file) in enumerate(input_file_tuples): |             gt_file, ocr_file = input_files | ||||||
|             if not gt_file or not ocr_file: |             assert gt_file, 'missing GT file' | ||||||
|                 # file/page was not found in this group |             assert ocr_file, 'missing OCR file' | ||||||
|                 continue |             assert gt_file.local_filename | ||||||
|             gt_file = self.workspace.download_file(gt_file) |             assert ocr_file.local_filename | ||||||
|             ocr_file = self.workspace.download_file(ocr_file) |         except (ValueError, AssertionError) as err: | ||||||
|             page_id = gt_file.pageId |             self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page? | ||||||
|  |             return | ||||||
| 
 | 
 | ||||||
|             log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file) |         page_id = gt_file.pageId | ||||||
| 
 | 
 | ||||||
|             file_id = make_file_id(ocr_file, self.output_file_grp) |         file_id = make_file_id(ocr_file, self.output_file_grp) | ||||||
|             report_prefix = os.path.join(self.output_file_grp, file_id) |         report_prefix = os.path.join(self.output_file_grp, file_id) | ||||||
| 
 | 
 | ||||||
|             # Process the files |         # Process the files | ||||||
|             try: |         try: | ||||||
|                 os.mkdir(self.output_file_grp) |             os.mkdir(self.output_file_grp) | ||||||
|             except FileExistsError: |         except FileExistsError: | ||||||
|                 pass |             pass | ||||||
|             cli_process( |         cli_process( | ||||||
|                 gt_file.local_filename, |             gt_file.local_filename, | ||||||
|                 ocr_file.local_filename, |             ocr_file.local_filename, | ||||||
|                 report_prefix, |             report_prefix, | ||||||
|                 metrics=metrics, |             metrics=metrics, | ||||||
|                 textequiv_level=textequiv_level, |             textequiv_level=textequiv_level, | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         # Add reports to the workspace | ||||||
|  |         for report_suffix, mimetype in [ | ||||||
|  |             [".html", "text/html"], | ||||||
|  |             [".json", "application/json"], | ||||||
|  |         ]: | ||||||
|  |             self.workspace.add_file( | ||||||
|  |                 file_id=file_id + report_suffix, | ||||||
|  |                 file_grp=self.output_file_grp, | ||||||
|  |                 page_id=page_id, | ||||||
|  |                 mimetype=mimetype, | ||||||
|  |                 local_filename=report_prefix + report_suffix, | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|             # Add reports to the workspace |  | ||||||
|             for report_suffix, mimetype in [ |  | ||||||
|                 [".html", "text/html"], |  | ||||||
|                 [".json", "application/json"], |  | ||||||
|             ]: |  | ||||||
|                 self.workspace.add_file( |  | ||||||
|                     file_id=file_id + report_suffix, |  | ||||||
|                     file_grp=self.output_file_grp, |  | ||||||
|                     page_id=page_id, |  | ||||||
|                     mimetype=mimetype, |  | ||||||
|                     local_filename=report_prefix + report_suffix, |  | ||||||
|                 ) |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||||
|     ocrd_dinglehopper() |     ocrd_dinglehopper() | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue