mirror of
				https://github.com/qurator-spk/dinglehopper.git
				synced 2025-11-04 03:14:16 +01:00 
			
		
		
		
	
						commit
						b1c109baae
					
				
					 6 changed files with 92 additions and 75 deletions
				
			
		
							
								
								
									
										5
									
								
								.dockerignore
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								.dockerignore
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,5 @@
 | 
			
		|||
src/dinglehopper/tests
 | 
			
		||||
dist
 | 
			
		||||
build
 | 
			
		||||
*.egg-info
 | 
			
		||||
.git
 | 
			
		||||
							
								
								
									
										36
									
								
								Dockerfile
									
										
									
									
									
								
							
							
						
						
									
										36
									
								
								Dockerfile
									
										
									
									
									
								
							| 
						 | 
				
			
			@ -6,17 +6,33 @@ LABEL \
 | 
			
		|||
    maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
 | 
			
		||||
    org.label-schema.vcs-ref=$VCS_REF \
 | 
			
		||||
    org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
 | 
			
		||||
    org.label-schema.build-date=$BUILD_DATE
 | 
			
		||||
    org.label-schema.build-date=$BUILD_DATE \
 | 
			
		||||
    org.opencontainers.image.vendor="qurator" \
 | 
			
		||||
    org.opencontainers.image.title="dinglehopper" \
 | 
			
		||||
    org.opencontainers.image.description="An OCR evaluation tool" \
 | 
			
		||||
    org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
 | 
			
		||||
    org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
 | 
			
		||||
    org.opencontainers.image.revision=$VCS_REF \
 | 
			
		||||
    org.opencontainers.image.created=$BUILD_DATE \
 | 
			
		||||
    org.opencontainers.image.base.name=ocrd/core
 | 
			
		||||
 | 
			
		||||
ENV LANG=C.UTF-8
 | 
			
		||||
ENV LC_ALL=C.UTF-8
 | 
			
		||||
 | 
			
		||||
# avoid HOME/.local/share (hard to predict USER here)
 | 
			
		||||
# so let XDG_DATA_HOME coincide with fixed system location
 | 
			
		||||
# (can still be overridden by derived stages)
 | 
			
		||||
ENV XDG_DATA_HOME /usr/local/share
 | 
			
		||||
# avoid the need for an extra volume for persistent resource user db
 | 
			
		||||
# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
 | 
			
		||||
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
 | 
			
		||||
 | 
			
		||||
WORKDIR /build/dinglehopper
 | 
			
		||||
COPY pyproject.toml .
 | 
			
		||||
COPY src/dinglehopper/ocrd-tool.json .
 | 
			
		||||
COPY src ./src
 | 
			
		||||
COPY requirements.txt .
 | 
			
		||||
COPY README.md .
 | 
			
		||||
COPY Makefile .
 | 
			
		||||
RUN make install
 | 
			
		||||
RUN rm -rf /build/dinglehopper
 | 
			
		||||
COPY . .
 | 
			
		||||
COPY ocrd-tool.json .
 | 
			
		||||
# prepackage ocrd-tool.json as ocrd-all-tool.json
 | 
			
		||||
RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
 | 
			
		||||
RUN make install && rm -rf /build/dinglehopper
 | 
			
		||||
 | 
			
		||||
WORKDIR /data
 | 
			
		||||
VOLUME ["/data"]
 | 
			
		||||
VOLUME /data
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										11
									
								
								Makefile
									
										
									
									
									
								
							
							
						
						
									
										11
									
								
								Makefile
									
										
									
									
									
								
							| 
						 | 
				
			
			@ -1,8 +1,9 @@
 | 
			
		|||
PYTHON = python3
 | 
			
		||||
PIP = pip3
 | 
			
		||||
PYTHONIOENCODING=utf8
 | 
			
		||||
PYTEST_ARGS = -vv
 | 
			
		||||
 | 
			
		||||
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0
 | 
			
		||||
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
 | 
			
		||||
DOCKER_TAG = ocrd/dinglehopper
 | 
			
		||||
 | 
			
		||||
help:
 | 
			
		||||
| 
						 | 
				
			
			@ -16,6 +17,12 @@ help:
 | 
			
		|||
install:
 | 
			
		||||
	$(PIP) install .
 | 
			
		||||
 | 
			
		||||
install-dev:
 | 
			
		||||
	$(PIP) install -e .
 | 
			
		||||
 | 
			
		||||
test:
 | 
			
		||||
	pytest $(PYTEST_ARGS)
 | 
			
		||||
 | 
			
		||||
docker:
 | 
			
		||||
	docker build \
 | 
			
		||||
	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
 | 
			
		||||
| 
						 | 
				
			
			@ -23,4 +30,4 @@ docker:
 | 
			
		|||
	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
 | 
			
		||||
	-t $(DOCKER_TAG) .
 | 
			
		||||
 | 
			
		||||
.PHONY: help install docker
 | 
			
		||||
.PHONY: help install install-dev test docker
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -5,7 +5,7 @@ uniseg >= 0.9.1
 | 
			
		|||
numpy
 | 
			
		||||
colorama
 | 
			
		||||
MarkupSafe
 | 
			
		||||
ocrd >= 2.65.0
 | 
			
		||||
ocrd >= 3.3.0
 | 
			
		||||
attrs
 | 
			
		||||
multimethod >= 1.3
 | 
			
		||||
tqdm
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,17 +1,13 @@
 | 
			
		|||
{
 | 
			
		||||
  "version": "0.9.7",
 | 
			
		||||
  "git_url": "https://github.com/qurator-spk/dinglehopper",
 | 
			
		||||
  "dockerhub": "ocrd/dinglehopper",
 | 
			
		||||
  "tools": {
 | 
			
		||||
    "ocrd-dinglehopper": {
 | 
			
		||||
      "executable": "ocrd-dinglehopper",
 | 
			
		||||
      "input_file_grp_cardinality": 2,
 | 
			
		||||
      "output_file_grp_cardinality": 1,
 | 
			
		||||
      "description": "Evaluate OCR text against ground truth with dinglehopper",
 | 
			
		||||
      "input_file_grp": [
 | 
			
		||||
        "OCR-D-GT-PAGE",
 | 
			
		||||
        "OCR-D-OCR"
 | 
			
		||||
      ],
 | 
			
		||||
      "output_file_grp": [
 | 
			
		||||
        "OCR-D-OCR-EVAL"
 | 
			
		||||
      ],
 | 
			
		||||
      "categories": [
 | 
			
		||||
        "Quality assurance"
 | 
			
		||||
      ],
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,83 +1,76 @@
 | 
			
		|||
import json
 | 
			
		||||
from functools import cached_property
 | 
			
		||||
import os
 | 
			
		||||
from typing import Optional
 | 
			
		||||
 | 
			
		||||
import click
 | 
			
		||||
import importlib_resources
 | 
			
		||||
from ocrd_models import OcrdFileType
 | 
			
		||||
from ocrd import Processor
 | 
			
		||||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
 | 
			
		||||
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
 | 
			
		||||
from ocrd_utils import make_file_id
 | 
			
		||||
 | 
			
		||||
from .cli import process as cli_process
 | 
			
		||||
 | 
			
		||||
OCRD_TOOL = json.loads(
 | 
			
		||||
    importlib_resources.files(__name__)
 | 
			
		||||
    .joinpath("ocrd-tool.json")
 | 
			
		||||
    .read_text(encoding="utf-8", errors="strict")
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@click.command()
 | 
			
		||||
@ocrd_cli_options
 | 
			
		||||
def ocrd_dinglehopper(*args, **kwargs):
 | 
			
		||||
    return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class OcrdDinglehopperEvaluate(Processor):
 | 
			
		||||
    def __init__(self, *args, **kwargs):
 | 
			
		||||
        kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
 | 
			
		||||
        kwargs["version"] = OCRD_TOOL["version"]
 | 
			
		||||
        super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
 | 
			
		||||
 | 
			
		||||
    def process(self):
 | 
			
		||||
        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
 | 
			
		||||
        assert_file_grp_cardinality(self.output_file_grp, 1)
 | 
			
		||||
    @cached_property
 | 
			
		||||
    def executable(self):
 | 
			
		||||
        return 'ocrd-dinglehopper'
 | 
			
		||||
 | 
			
		||||
        log = getLogger("processor.OcrdDinglehopperEvaluate")
 | 
			
		||||
    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
 | 
			
		||||
 | 
			
		||||
        assert self.parameter
 | 
			
		||||
        metrics = self.parameter["metrics"]
 | 
			
		||||
        textequiv_level = self.parameter["textequiv_level"]
 | 
			
		||||
        gt_grp, ocr_grp = self.input_file_grp.split(",")
 | 
			
		||||
 | 
			
		||||
        input_file_tuples = self.zip_input_files(on_error="abort")
 | 
			
		||||
        for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
 | 
			
		||||
            if not gt_file or not ocr_file:
 | 
			
		||||
                # file/page was not found in this group
 | 
			
		||||
                continue
 | 
			
		||||
            gt_file = self.workspace.download_file(gt_file)
 | 
			
		||||
            ocr_file = self.workspace.download_file(ocr_file)
 | 
			
		||||
            page_id = gt_file.pageId
 | 
			
		||||
        # wrong number of inputs: let fail
 | 
			
		||||
        gt_file, ocr_file = input_files
 | 
			
		||||
        # missing on either side: skip (zip_input_files already warned)
 | 
			
		||||
        if not gt_file or not ocr_file:
 | 
			
		||||
            return
 | 
			
		||||
        # missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
 | 
			
		||||
        if not gt_file.local_filename:
 | 
			
		||||
            if config.OCRD_MISSING_INPUT == 'ABORT':
 | 
			
		||||
                raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype)
 | 
			
		||||
            return
 | 
			
		||||
        if not ocr_file.local_filename:
 | 
			
		||||
            if config.OCRD_MISSING_INPUT == 'ABORT':
 | 
			
		||||
                raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype)
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
            log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
 | 
			
		||||
        page_id = gt_file.pageId
 | 
			
		||||
 | 
			
		||||
            file_id = make_file_id(ocr_file, self.output_file_grp)
 | 
			
		||||
            report_prefix = os.path.join(self.output_file_grp, file_id)
 | 
			
		||||
        file_id = make_file_id(ocr_file, self.output_file_grp)
 | 
			
		||||
        cli_process(
 | 
			
		||||
            gt_file.local_filename,
 | 
			
		||||
            ocr_file.local_filename,
 | 
			
		||||
            file_id,
 | 
			
		||||
            self.output_file_grp,
 | 
			
		||||
            metrics=metrics,
 | 
			
		||||
            textequiv_level=textequiv_level,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
            # Process the files
 | 
			
		||||
            try:
 | 
			
		||||
                os.mkdir(self.output_file_grp)
 | 
			
		||||
            except FileExistsError:
 | 
			
		||||
                pass
 | 
			
		||||
            cli_process(
 | 
			
		||||
                gt_file.local_filename,
 | 
			
		||||
                ocr_file.local_filename,
 | 
			
		||||
                report_prefix,
 | 
			
		||||
                metrics=metrics,
 | 
			
		||||
                textequiv_level=textequiv_level,
 | 
			
		||||
        # Add reports to the workspace
 | 
			
		||||
        for report_suffix, mimetype in [
 | 
			
		||||
            [".html", "text/html"],
 | 
			
		||||
            [".json", "application/json"],
 | 
			
		||||
        ]:
 | 
			
		||||
            output_file_id = file_id + report_suffix
 | 
			
		||||
            output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
 | 
			
		||||
            if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
 | 
			
		||||
                raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set")
 | 
			
		||||
            self.workspace.add_file(
 | 
			
		||||
               file_id=output_file_id,
 | 
			
		||||
                file_grp=self.output_file_grp,
 | 
			
		||||
                page_id=page_id,
 | 
			
		||||
                mimetype=mimetype,
 | 
			
		||||
                local_filename=file_id + report_suffix,
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            # Add reports to the workspace
 | 
			
		||||
            for report_suffix, mimetype in [
 | 
			
		||||
                [".html", "text/html"],
 | 
			
		||||
                [".json", "application/json"],
 | 
			
		||||
            ]:
 | 
			
		||||
                self.workspace.add_file(
 | 
			
		||||
                    file_id=file_id + report_suffix,
 | 
			
		||||
                    file_grp=self.output_file_grp,
 | 
			
		||||
                    page_id=page_id,
 | 
			
		||||
                    mimetype=mimetype,
 | 
			
		||||
                    local_filename=report_prefix + report_suffix,
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
    ocrd_dinglehopper()
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue