mirror of
				https://github.com/qurator-spk/dinglehopper.git
				synced 2025-11-04 11:24:17 +01:00 
			
		
		
		
	Port to OCR-D/core API v3
This commit is contained in:
		
							parent
							
								
									071e6a8bd1
								
							
						
					
					
						commit
						eb4b247b37
					
				
					 5 changed files with 84 additions and 73 deletions
				
			
		
							
								
								
									
										5
									
								
								.dockerignore
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								.dockerignore
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,5 @@
 | 
				
			||||||
 | 
					src/dinglehopper/tests
 | 
				
			||||||
 | 
					dist
 | 
				
			||||||
 | 
					build
 | 
				
			||||||
 | 
					*.egg-info
 | 
				
			||||||
 | 
					.git
 | 
				
			||||||
							
								
								
									
										33
									
								
								Dockerfile
									
										
									
									
									
								
							
							
						
						
									
										33
									
								
								Dockerfile
									
										
									
									
									
								
							| 
						 | 
					@ -6,17 +6,30 @@ LABEL \
 | 
				
			||||||
    maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
 | 
					    maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
 | 
				
			||||||
    org.label-schema.vcs-ref=$VCS_REF \
 | 
					    org.label-schema.vcs-ref=$VCS_REF \
 | 
				
			||||||
    org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
 | 
					    org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
 | 
				
			||||||
    org.label-schema.build-date=$BUILD_DATE
 | 
					    org.label-schema.build-date=$BUILD_DATE \
 | 
				
			||||||
 | 
					    org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
 | 
				
			||||||
 | 
					    org.opencontainers.image.title="dinglehopper" \
 | 
				
			||||||
 | 
					    org.opencontainers.image.description="The OCR evaluation tool" \
 | 
				
			||||||
 | 
					    org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
 | 
				
			||||||
 | 
					    org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
 | 
				
			||||||
 | 
					    org.opencontainers.image.revision=$VCS_REF \
 | 
				
			||||||
 | 
					    org.opencontainers.image.created=$BUILD_DATE \
 | 
				
			||||||
 | 
					    org.opencontainers.image.base.name=ocrd/core
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					ENV LANG=C.UTF-8
 | 
				
			||||||
 | 
					ENV LC_ALL=C.UTF-8
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# avoid HOME/.local/share (hard to predict USER here)
 | 
				
			||||||
 | 
					# so let XDG_DATA_HOME coincide with fixed system location
 | 
				
			||||||
 | 
					# (can still be overridden by derived stages)
 | 
				
			||||||
 | 
					ENV XDG_DATA_HOME /usr/local/share
 | 
				
			||||||
 | 
					# avoid the need for an extra volume for persistent resource user db
 | 
				
			||||||
 | 
					# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
 | 
				
			||||||
 | 
					ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
 | 
				
			||||||
 | 
					
 | 
				
			||||||
WORKDIR /build/dinglehopper
 | 
					WORKDIR /build/dinglehopper
 | 
				
			||||||
COPY pyproject.toml .
 | 
					COPY . .
 | 
				
			||||||
COPY src/dinglehopper/ocrd-tool.json .
 | 
					RUN make install && rm -rf /build/dinglehopper
 | 
				
			||||||
COPY src ./src
 | 
					 | 
				
			||||||
COPY requirements.txt .
 | 
					 | 
				
			||||||
COPY README.md .
 | 
					 | 
				
			||||||
COPY Makefile .
 | 
					 | 
				
			||||||
RUN make install
 | 
					 | 
				
			||||||
RUN rm -rf /build/dinglehopper
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
WORKDIR /data
 | 
					WORKDIR /data
 | 
				
			||||||
VOLUME ["/data"]
 | 
					VOLUME /data
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										9
									
								
								Makefile
									
										
									
									
									
								
							
							
						
						
									
										9
									
								
								Makefile
									
										
									
									
									
								
							| 
						 | 
					@ -1,8 +1,9 @@
 | 
				
			||||||
PYTHON = python3
 | 
					PYTHON = python3
 | 
				
			||||||
PIP = pip3
 | 
					PIP = pip3
 | 
				
			||||||
PYTHONIOENCODING=utf8
 | 
					PYTHONIOENCODING=utf8
 | 
				
			||||||
 | 
					PYTEST_ARGS = -vv
 | 
				
			||||||
 | 
					
 | 
				
			||||||
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0
 | 
					DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
 | 
				
			||||||
DOCKER_TAG = ocrd/dinglehopper
 | 
					DOCKER_TAG = ocrd/dinglehopper
 | 
				
			||||||
 | 
					
 | 
				
			||||||
help:
 | 
					help:
 | 
				
			||||||
| 
						 | 
					@ -16,6 +17,12 @@ help:
 | 
				
			||||||
install:
 | 
					install:
 | 
				
			||||||
	$(PIP) install .
 | 
						$(PIP) install .
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					install-dev:
 | 
				
			||||||
 | 
						$(PIP) install -e .
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					test:
 | 
				
			||||||
 | 
						pytest $(PYTEST_ARGS)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
docker:
 | 
					docker:
 | 
				
			||||||
	docker build \
 | 
						docker build \
 | 
				
			||||||
	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
 | 
						--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,17 +1,13 @@
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  "version": "0.9.7",
 | 
					  "version": "0.9.7",
 | 
				
			||||||
  "git_url": "https://github.com/qurator-spk/dinglehopper",
 | 
					  "git_url": "https://github.com/qurator-spk/dinglehopper",
 | 
				
			||||||
 | 
					  "dockerhub": "ocrd/dinglehopper",
 | 
				
			||||||
  "tools": {
 | 
					  "tools": {
 | 
				
			||||||
    "ocrd-dinglehopper": {
 | 
					    "ocrd-dinglehopper": {
 | 
				
			||||||
      "executable": "ocrd-dinglehopper",
 | 
					      "executable": "ocrd-dinglehopper",
 | 
				
			||||||
 | 
					      "input_file_grp_cardinality": 2,
 | 
				
			||||||
 | 
					      "output_file_grp_cardinality": 1,
 | 
				
			||||||
      "description": "Evaluate OCR text against ground truth with dinglehopper",
 | 
					      "description": "Evaluate OCR text against ground truth with dinglehopper",
 | 
				
			||||||
      "input_file_grp": [
 | 
					 | 
				
			||||||
        "OCR-D-GT-PAGE",
 | 
					 | 
				
			||||||
        "OCR-D-OCR"
 | 
					 | 
				
			||||||
      ],
 | 
					 | 
				
			||||||
      "output_file_grp": [
 | 
					 | 
				
			||||||
        "OCR-D-OCR-EVAL"
 | 
					 | 
				
			||||||
      ],
 | 
					 | 
				
			||||||
      "categories": [
 | 
					      "categories": [
 | 
				
			||||||
        "Quality assurance"
 | 
					        "Quality assurance"
 | 
				
			||||||
      ],
 | 
					      ],
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,83 +1,73 @@
 | 
				
			||||||
import json
 | 
					from functools import cached_property
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
 | 
					from typing import Optional
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import click
 | 
					import click
 | 
				
			||||||
import importlib_resources
 | 
					from ocrd_models import OcrdFileType
 | 
				
			||||||
from ocrd import Processor
 | 
					from ocrd import Processor
 | 
				
			||||||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
 | 
					from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
 | 
				
			||||||
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
 | 
					from ocrd_utils import make_file_id
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .cli import process as cli_process
 | 
					from .cli import process as cli_process
 | 
				
			||||||
 | 
					
 | 
				
			||||||
OCRD_TOOL = json.loads(
 | 
					 | 
				
			||||||
    importlib_resources.files(__name__)
 | 
					 | 
				
			||||||
    .joinpath("ocrd-tool.json")
 | 
					 | 
				
			||||||
    .read_text(encoding="utf-8", errors="strict")
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@click.command()
 | 
					@click.command()
 | 
				
			||||||
@ocrd_cli_options
 | 
					@ocrd_cli_options
 | 
				
			||||||
def ocrd_dinglehopper(*args, **kwargs):
 | 
					def ocrd_dinglehopper(*args, **kwargs):
 | 
				
			||||||
    return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
 | 
					    return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
class OcrdDinglehopperEvaluate(Processor):
 | 
					class OcrdDinglehopperEvaluate(Processor):
 | 
				
			||||||
    def __init__(self, *args, **kwargs):
 | 
					 | 
				
			||||||
        kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
 | 
					 | 
				
			||||||
        kwargs["version"] = OCRD_TOOL["version"]
 | 
					 | 
				
			||||||
        super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def process(self):
 | 
					    @cached_property
 | 
				
			||||||
        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
 | 
					    def executable(self):
 | 
				
			||||||
        assert_file_grp_cardinality(self.output_file_grp, 1)
 | 
					        return 'ocrd-dinglehopper'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        log = getLogger("processor.OcrdDinglehopperEvaluate")
 | 
					    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        assert self.parameter
 | 
				
			||||||
        metrics = self.parameter["metrics"]
 | 
					        metrics = self.parameter["metrics"]
 | 
				
			||||||
        textequiv_level = self.parameter["textequiv_level"]
 | 
					        textequiv_level = self.parameter["textequiv_level"]
 | 
				
			||||||
        gt_grp, ocr_grp = self.input_file_grp.split(",")
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        input_file_tuples = self.zip_input_files(on_error="abort")
 | 
					        try:
 | 
				
			||||||
        for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
 | 
					            gt_file, ocr_file = input_files
 | 
				
			||||||
            if not gt_file or not ocr_file:
 | 
					            assert gt_file, 'missing GT file'
 | 
				
			||||||
                # file/page was not found in this group
 | 
					            assert ocr_file, 'missing OCR file'
 | 
				
			||||||
                continue
 | 
					            assert gt_file.local_filename
 | 
				
			||||||
            gt_file = self.workspace.download_file(gt_file)
 | 
					            assert ocr_file.local_filename
 | 
				
			||||||
            ocr_file = self.workspace.download_file(ocr_file)
 | 
					        except (ValueError, AssertionError) as err:
 | 
				
			||||||
            page_id = gt_file.pageId
 | 
					            self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page?
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
 | 
					        page_id = gt_file.pageId
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            file_id = make_file_id(ocr_file, self.output_file_grp)
 | 
					        file_id = make_file_id(ocr_file, self.output_file_grp)
 | 
				
			||||||
            report_prefix = os.path.join(self.output_file_grp, file_id)
 | 
					        report_prefix = os.path.join(self.output_file_grp, file_id)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # Process the files
 | 
					        # Process the files
 | 
				
			||||||
            try:
 | 
					        try:
 | 
				
			||||||
                os.mkdir(self.output_file_grp)
 | 
					            os.mkdir(self.output_file_grp)
 | 
				
			||||||
            except FileExistsError:
 | 
					        except FileExistsError:
 | 
				
			||||||
                pass
 | 
					            pass
 | 
				
			||||||
            cli_process(
 | 
					        cli_process(
 | 
				
			||||||
                gt_file.local_filename,
 | 
					            gt_file.local_filename,
 | 
				
			||||||
                ocr_file.local_filename,
 | 
					            ocr_file.local_filename,
 | 
				
			||||||
                report_prefix,
 | 
					            report_prefix,
 | 
				
			||||||
                metrics=metrics,
 | 
					            metrics=metrics,
 | 
				
			||||||
                textequiv_level=textequiv_level,
 | 
					            textequiv_level=textequiv_level,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Add reports to the workspace
 | 
				
			||||||
 | 
					        for report_suffix, mimetype in [
 | 
				
			||||||
 | 
					            [".html", "text/html"],
 | 
				
			||||||
 | 
					            [".json", "application/json"],
 | 
				
			||||||
 | 
					        ]:
 | 
				
			||||||
 | 
					            self.workspace.add_file(
 | 
				
			||||||
 | 
					                file_id=file_id + report_suffix,
 | 
				
			||||||
 | 
					                file_grp=self.output_file_grp,
 | 
				
			||||||
 | 
					                page_id=page_id,
 | 
				
			||||||
 | 
					                mimetype=mimetype,
 | 
				
			||||||
 | 
					                local_filename=report_prefix + report_suffix,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # Add reports to the workspace
 | 
					 | 
				
			||||||
            for report_suffix, mimetype in [
 | 
					 | 
				
			||||||
                [".html", "text/html"],
 | 
					 | 
				
			||||||
                [".json", "application/json"],
 | 
					 | 
				
			||||||
            ]:
 | 
					 | 
				
			||||||
                self.workspace.add_file(
 | 
					 | 
				
			||||||
                    file_id=file_id + report_suffix,
 | 
					 | 
				
			||||||
                    file_grp=self.output_file_grp,
 | 
					 | 
				
			||||||
                    page_id=page_id,
 | 
					 | 
				
			||||||
                    mimetype=mimetype,
 | 
					 | 
				
			||||||
                    local_filename=report_prefix + report_suffix,
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == "__main__":
 | 
					if __name__ == "__main__":
 | 
				
			||||||
    ocrd_dinglehopper()
 | 
					    ocrd_dinglehopper()
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue