Merge pull request #128 from kba/v3-api

V3 api
2025-11-04 11:24:17 +01:00 · 2025-04-17 08:34:51 +02:00 · 2025-04-17 08:34:51 +02:00 · b1c109baae
commit b1c109baae
parent bf6633be02 13ab1ae150
6 changed files with 92 additions and 75 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1,5 @@
 src/dinglehopper/tests
 dist
 build
 *.egg-info
 .git
--- a/36
+++ b/36
@ -6,17 +6,33 @@ LABEL \
    maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
    org.label-schema.vcs-ref=$VCS_REF \
    org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
-    org.label-schema.build-date=$BUILD_DATE
+    org.label-schema.build-date=$BUILD_DATE \
    org.opencontainers.image.vendor="qurator" \
    org.opencontainers.image.title="dinglehopper" \
    org.opencontainers.image.description="An OCR evaluation tool" \
    org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
    org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
    org.opencontainers.image.revision=$VCS_REF \
    org.opencontainers.image.created=$BUILD_DATE \
    org.opencontainers.image.base.name=ocrd/core
 ENV LANG=C.UTF-8
 ENV LC_ALL=C.UTF-8
 # avoid HOME/.local/share (hard to predict USER here)
 # so let XDG_DATA_HOME coincide with fixed system location
 # (can still be overridden by derived stages)
 ENV XDG_DATA_HOME /usr/local/share
 # avoid the need for an extra volume for persistent resource user db
 # (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
 ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
 WORKDIR /build/dinglehopper
-COPY pyproject.toml .
+COPY . .
-COPY src/dinglehopper/ocrd-tool.json .
+COPY ocrd-tool.json .
-COPY src ./src
+# prepackage ocrd-tool.json as ocrd-all-tool.json
-COPY requirements.txt .
+RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
-COPY README.md .
+RUN make install && rm -rf /build/dinglehopper
 COPY Makefile .
 RUN make install
 RUN rm -rf /build/dinglehopper
 WORKDIR /data
-VOLUME ["/data"]
+VOLUME /data
--- a/11
+++ b/11
@ -1,8 +1,9 @@
 PYTHON = python3
 PIP = pip3
 PYTHONIOENCODING=utf8
 PYTEST_ARGS = -vv
-DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0
+DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
 DOCKER_TAG = ocrd/dinglehopper
 help:
@ -16,6 +17,12 @@ help:
 install:
 	$(PIP) install .
 install-dev:
 	$(PIP) install -e .
 test:
 	pytest $(PYTEST_ARGS)
 docker:
 	docker build \
 	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
@ -23,4 +30,4 @@ docker:
 	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
 	-t $(DOCKER_TAG) .
-.PHONY: help install docker
+.PHONY: help install install-dev test docker
--- a/requirements.txt
+++ b/requirements.txt
@ -5,7 +5,7 @@ uniseg >= 0.9.1
 numpy
 colorama
 MarkupSafe
-ocrd >= 2.65.0
+ocrd >= 3.3.0
 attrs
 multimethod >= 1.3
 tqdm
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@ -1,17 +1,13 @@
 {
  "version": "0.9.7",
  "git_url": "https://github.com/qurator-spk/dinglehopper",
  "dockerhub": "ocrd/dinglehopper",
  "tools": {
    "ocrd-dinglehopper": {
      "executable": "ocrd-dinglehopper",
      "input_file_grp_cardinality": 2,
      "output_file_grp_cardinality": 1,
      "description": "Evaluate OCR text against ground truth with dinglehopper",
      "input_file_grp": [
        "OCR-D-GT-PAGE",
        "OCR-D-OCR"
      ],
      "output_file_grp": [
        "OCR-D-OCR-EVAL"
      ],
      "categories": [
        "Quality assurance"
      ],
--- a/src/dinglehopper/ocrd_cli.py
+++ b/src/dinglehopper/ocrd_cli.py
@ -1,83 +1,76 @@
-import json
+from functools import cached_property
 import os
 from typing import Optional
 import click
-import importlib_resources
+from ocrd_models import OcrdFileType
 from ocrd import Processor
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
-from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
+from ocrd_utils import make_file_id
 from .cli import process as cli_process
 OCRD_TOOL = json.loads(
    importlib_resources.files(__name__)
    .joinpath("ocrd-tool.json")
    .read_text(encoding="utf-8", errors="strict")
 )
@click.command()
@ocrd_cli_options
 def ocrd_dinglehopper(*args, **kwargs):
    return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
 class OcrdDinglehopperEvaluate(Processor):
    def __init__(self, *args, **kwargs):
        kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
        kwargs["version"] = OCRD_TOOL["version"]
        super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
-    def process(self):
+    @cached_property
-        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
+    def executable(self):
-        assert_file_grp_cardinality(self.output_file_grp, 1)
+        return 'ocrd-dinglehopper'
-        log = getLogger("processor.OcrdDinglehopperEvaluate")
+    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
        assert self.parameter
        metrics = self.parameter["metrics"]
        textequiv_level = self.parameter["textequiv_level"]
        gt_grp, ocr_grp = self.input_file_grp.split(",")
-        input_file_tuples = self.zip_input_files(on_error="abort")
+        # wrong number of inputs: let fail
-        for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
+        gt_file, ocr_file = input_files
-            if not gt_file or not ocr_file:
+        # missing on either side: skip (zip_input_files already warned)
-                # file/page was not found in this group
+        if not gt_file or not ocr_file:
-                continue
+            return
-            gt_file = self.workspace.download_file(gt_file)
+        # missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
-            ocr_file = self.workspace.download_file(ocr_file)
+        if not gt_file.local_filename:
-            page_id = gt_file.pageId
+            if config.OCRD_MISSING_INPUT == 'ABORT':
                raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype)
            return
        if not ocr_file.local_filename:
            if config.OCRD_MISSING_INPUT == 'ABORT':
                raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype)
            return
-            log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
+        page_id = gt_file.pageId
-            file_id = make_file_id(ocr_file, self.output_file_grp)
+        file_id = make_file_id(ocr_file, self.output_file_grp)
-            report_prefix = os.path.join(self.output_file_grp, file_id)
+        cli_process(
            gt_file.local_filename,
            ocr_file.local_filename,
            file_id,
            self.output_file_grp,
            metrics=metrics,
            textequiv_level=textequiv_level,
        )
-            # Process the files
+        # Add reports to the workspace
-            try:
+        for report_suffix, mimetype in [
-                os.mkdir(self.output_file_grp)
+            [".html", "text/html"],
-            except FileExistsError:
+            [".json", "application/json"],
-                pass
+        ]:
-            cli_process(
+            output_file_id = file_id + report_suffix
-                gt_file.local_filename,
+            output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
-                ocr_file.local_filename,
+            if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
-                report_prefix,
+                raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set")
-                metrics=metrics,
+            self.workspace.add_file(
-                textequiv_level=textequiv_level,
+               file_id=output_file_id,
                file_grp=self.output_file_grp,
                page_id=page_id,
                mimetype=mimetype,
                local_filename=file_id + report_suffix,
            )
            # Add reports to the workspace
            for report_suffix, mimetype in [
                [".html", "text/html"],
                [".json", "application/json"],
            ]:
                self.workspace.add_file(
                    file_id=file_id + report_suffix,
                    file_grp=self.output_file_grp,
                    page_id=page_id,
                    mimetype=mimetype,
                    local_filename=report_prefix + report_suffix,
                )
 if __name__ == "__main__":
    ocrd_dinglehopper()