mirror of
				https://github.com/qurator-spk/dinglehopper.git
				synced 2025-11-04 11:24:17 +01:00 
			
		
		
		
	Port to OCR-D/core API v3
This commit is contained in:
		
							parent
							
								
									071e6a8bd1
								
							
						
					
					
						commit
						eb4b247b37
					
				
					 5 changed files with 84 additions and 73 deletions
				
			
		
							
								
								
									
										5
									
								
								.dockerignore
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								.dockerignore
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,5 @@
 | 
			
		|||
src/dinglehopper/tests
 | 
			
		||||
dist
 | 
			
		||||
build
 | 
			
		||||
*.egg-info
 | 
			
		||||
.git
 | 
			
		||||
							
								
								
									
										33
									
								
								Dockerfile
									
										
									
									
									
								
							
							
						
						
									
										33
									
								
								Dockerfile
									
										
									
									
									
								
							| 
						 | 
				
			
			@ -6,17 +6,30 @@ LABEL \
 | 
			
		|||
    maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
 | 
			
		||||
    org.label-schema.vcs-ref=$VCS_REF \
 | 
			
		||||
    org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
 | 
			
		||||
    org.label-schema.build-date=$BUILD_DATE
 | 
			
		||||
    org.label-schema.build-date=$BUILD_DATE \
 | 
			
		||||
    org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
 | 
			
		||||
    org.opencontainers.image.title="dinglehopper" \
 | 
			
		||||
    org.opencontainers.image.description="The OCR evaluation tool" \
 | 
			
		||||
    org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
 | 
			
		||||
    org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
 | 
			
		||||
    org.opencontainers.image.revision=$VCS_REF \
 | 
			
		||||
    org.opencontainers.image.created=$BUILD_DATE \
 | 
			
		||||
    org.opencontainers.image.base.name=ocrd/core
 | 
			
		||||
 | 
			
		||||
ENV LANG=C.UTF-8
 | 
			
		||||
ENV LC_ALL=C.UTF-8
 | 
			
		||||
 | 
			
		||||
# avoid HOME/.local/share (hard to predict USER here)
 | 
			
		||||
# so let XDG_DATA_HOME coincide with fixed system location
 | 
			
		||||
# (can still be overridden by derived stages)
 | 
			
		||||
ENV XDG_DATA_HOME /usr/local/share
 | 
			
		||||
# avoid the need for an extra volume for persistent resource user db
 | 
			
		||||
# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
 | 
			
		||||
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
 | 
			
		||||
 | 
			
		||||
WORKDIR /build/dinglehopper
 | 
			
		||||
COPY pyproject.toml .
 | 
			
		||||
COPY src/dinglehopper/ocrd-tool.json .
 | 
			
		||||
COPY src ./src
 | 
			
		||||
COPY requirements.txt .
 | 
			
		||||
COPY README.md .
 | 
			
		||||
COPY Makefile .
 | 
			
		||||
RUN make install
 | 
			
		||||
RUN rm -rf /build/dinglehopper
 | 
			
		||||
COPY . .
 | 
			
		||||
RUN make install && rm -rf /build/dinglehopper
 | 
			
		||||
 | 
			
		||||
WORKDIR /data
 | 
			
		||||
VOLUME ["/data"]
 | 
			
		||||
VOLUME /data
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										9
									
								
								Makefile
									
										
									
									
									
								
							
							
						
						
									
										9
									
								
								Makefile
									
										
									
									
									
								
							| 
						 | 
				
			
			@ -1,8 +1,9 @@
 | 
			
		|||
PYTHON = python3
 | 
			
		||||
PIP = pip3
 | 
			
		||||
PYTHONIOENCODING=utf8
 | 
			
		||||
PYTEST_ARGS = -vv
 | 
			
		||||
 | 
			
		||||
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0
 | 
			
		||||
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
 | 
			
		||||
DOCKER_TAG = ocrd/dinglehopper
 | 
			
		||||
 | 
			
		||||
help:
 | 
			
		||||
| 
						 | 
				
			
			@ -16,6 +17,12 @@ help:
 | 
			
		|||
install:
 | 
			
		||||
	$(PIP) install .
 | 
			
		||||
 | 
			
		||||
install-dev:
 | 
			
		||||
	$(PIP) install -e .
 | 
			
		||||
 | 
			
		||||
test:
 | 
			
		||||
	pytest $(PYTEST_ARGS)
 | 
			
		||||
 | 
			
		||||
docker:
 | 
			
		||||
	docker build \
 | 
			
		||||
	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,17 +1,13 @@
 | 
			
		|||
{
 | 
			
		||||
  "version": "0.9.7",
 | 
			
		||||
  "git_url": "https://github.com/qurator-spk/dinglehopper",
 | 
			
		||||
  "dockerhub": "ocrd/dinglehopper",
 | 
			
		||||
  "tools": {
 | 
			
		||||
    "ocrd-dinglehopper": {
 | 
			
		||||
      "executable": "ocrd-dinglehopper",
 | 
			
		||||
      "input_file_grp_cardinality": 2,
 | 
			
		||||
      "output_file_grp_cardinality": 1,
 | 
			
		||||
      "description": "Evaluate OCR text against ground truth with dinglehopper",
 | 
			
		||||
      "input_file_grp": [
 | 
			
		||||
        "OCR-D-GT-PAGE",
 | 
			
		||||
        "OCR-D-OCR"
 | 
			
		||||
      ],
 | 
			
		||||
      "output_file_grp": [
 | 
			
		||||
        "OCR-D-OCR-EVAL"
 | 
			
		||||
      ],
 | 
			
		||||
      "categories": [
 | 
			
		||||
        "Quality assurance"
 | 
			
		||||
      ],
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,54 +1,44 @@
 | 
			
		|||
import json
 | 
			
		||||
from functools import cached_property
 | 
			
		||||
import os
 | 
			
		||||
from typing import Optional
 | 
			
		||||
 | 
			
		||||
import click
 | 
			
		||||
import importlib_resources
 | 
			
		||||
from ocrd_models import OcrdFileType
 | 
			
		||||
from ocrd import Processor
 | 
			
		||||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
 | 
			
		||||
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
 | 
			
		||||
from ocrd_utils import make_file_id
 | 
			
		||||
 | 
			
		||||
from .cli import process as cli_process
 | 
			
		||||
 | 
			
		||||
OCRD_TOOL = json.loads(
 | 
			
		||||
    importlib_resources.files(__name__)
 | 
			
		||||
    .joinpath("ocrd-tool.json")
 | 
			
		||||
    .read_text(encoding="utf-8", errors="strict")
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@click.command()
 | 
			
		||||
@ocrd_cli_options
 | 
			
		||||
def ocrd_dinglehopper(*args, **kwargs):
 | 
			
		||||
    return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class OcrdDinglehopperEvaluate(Processor):
 | 
			
		||||
    def __init__(self, *args, **kwargs):
 | 
			
		||||
        kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
 | 
			
		||||
        kwargs["version"] = OCRD_TOOL["version"]
 | 
			
		||||
        super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
 | 
			
		||||
 | 
			
		||||
    def process(self):
 | 
			
		||||
        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
 | 
			
		||||
        assert_file_grp_cardinality(self.output_file_grp, 1)
 | 
			
		||||
    @cached_property
 | 
			
		||||
    def executable(self):
 | 
			
		||||
        return 'ocrd-dinglehopper'
 | 
			
		||||
 | 
			
		||||
        log = getLogger("processor.OcrdDinglehopperEvaluate")
 | 
			
		||||
    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
 | 
			
		||||
 | 
			
		||||
        assert self.parameter
 | 
			
		||||
        metrics = self.parameter["metrics"]
 | 
			
		||||
        textequiv_level = self.parameter["textequiv_level"]
 | 
			
		||||
        gt_grp, ocr_grp = self.input_file_grp.split(",")
 | 
			
		||||
 | 
			
		||||
        input_file_tuples = self.zip_input_files(on_error="abort")
 | 
			
		||||
        for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
 | 
			
		||||
            if not gt_file or not ocr_file:
 | 
			
		||||
                # file/page was not found in this group
 | 
			
		||||
                continue
 | 
			
		||||
            gt_file = self.workspace.download_file(gt_file)
 | 
			
		||||
            ocr_file = self.workspace.download_file(ocr_file)
 | 
			
		||||
        try:
 | 
			
		||||
            gt_file, ocr_file = input_files
 | 
			
		||||
            assert gt_file, 'missing GT file'
 | 
			
		||||
            assert ocr_file, 'missing OCR file'
 | 
			
		||||
            assert gt_file.local_filename
 | 
			
		||||
            assert ocr_file.local_filename
 | 
			
		||||
        except (ValueError, AssertionError) as err:
 | 
			
		||||
            self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page?
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
        page_id = gt_file.pageId
 | 
			
		||||
 | 
			
		||||
            log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
 | 
			
		||||
 | 
			
		||||
        file_id = make_file_id(ocr_file, self.output_file_grp)
 | 
			
		||||
        report_prefix = os.path.join(self.output_file_grp, file_id)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue