diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..a8312db --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +src/dinglehopper/tests +dist +build +*.egg-info +.git diff --git a/Dockerfile b/Dockerfile index 04e7330..e497d16 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,17 +6,33 @@ LABEL \ maintainer="https://github.com/qurator-spk/dinglehopper/issues" \ org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ - org.label-schema.build-date=$BUILD_DATE + org.label-schema.build-date=$BUILD_DATE \ + org.opencontainers.image.vendor="qurator" \ + org.opencontainers.image.title="dinglehopper" \ + org.opencontainers.image.description="An OCR evaluation tool" \ + org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \ + org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \ + org.opencontainers.image.revision=$VCS_REF \ + org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.base.name=ocrd/core + +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +# avoid HOME/.local/share (hard to predict USER here) +# so let XDG_DATA_HOME coincide with fixed system location +# (can still be overridden by derived stages) +ENV XDG_DATA_HOME /usr/local/share +# avoid the need for an extra volume for persistent resource user db +# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml) +ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources WORKDIR /build/dinglehopper -COPY pyproject.toml . -COPY src/dinglehopper/ocrd-tool.json . -COPY src ./src -COPY requirements.txt . -COPY README.md . -COPY Makefile . -RUN make install -RUN rm -rf /build/dinglehopper +COPY . . +COPY ocrd-tool.json . +# prepackage ocrd-tool.json as ocrd-all-tool.json +RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +RUN make install && rm -rf /build/dinglehopper WORKDIR /data -VOLUME ["/data"] +VOLUME /data diff --git a/Makefile b/Makefile index babaf5f..12f342a 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,9 @@ PYTHON = python3 PIP = pip3 PYTHONIOENCODING=utf8 +PYTEST_ARGS = -vv -DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0 +DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0 DOCKER_TAG = ocrd/dinglehopper help: @@ -16,6 +17,12 @@ help: install: $(PIP) install . +install-dev: + $(PIP) install -e . + +test: + pytest $(PYTEST_ARGS) + docker: docker build \ --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ @@ -23,4 +30,4 @@ docker: --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ -t $(DOCKER_TAG) . -.PHONY: help install docker +.PHONY: help install install-dev test docker diff --git a/requirements.txt b/requirements.txt index 0b3d819..123187b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ uniseg >= 0.9.1 numpy colorama MarkupSafe -ocrd >= 2.65.0 +ocrd >= 3.3.0 attrs multimethod >= 1.3 tqdm diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json index f4572c7..00d5d2b 100644 --- a/src/dinglehopper/ocrd-tool.json +++ b/src/dinglehopper/ocrd-tool.json @@ -1,17 +1,13 @@ { "version": "0.9.7", "git_url": "https://github.com/qurator-spk/dinglehopper", + "dockerhub": "ocrd/dinglehopper", "tools": { "ocrd-dinglehopper": { "executable": "ocrd-dinglehopper", + "input_file_grp_cardinality": 2, + "output_file_grp_cardinality": 1, "description": "Evaluate OCR text against ground truth with dinglehopper", - "input_file_grp": [ - "OCR-D-GT-PAGE", - "OCR-D-OCR" - ], - "output_file_grp": [ - "OCR-D-OCR-EVAL" - ], "categories": [ "Quality assurance" ], diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 4da4960..fa4747f 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -1,83 +1,76 @@ -import json +from functools import cached_property import os +from typing import Optional import click -import importlib_resources +from ocrd_models import OcrdFileType from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id +from ocrd_utils import make_file_id from .cli import process as cli_process -OCRD_TOOL = json.loads( - importlib_resources.files(__name__) - .joinpath("ocrd-tool.json") - .read_text(encoding="utf-8", errors="strict") -) - - @click.command() @ocrd_cli_options def ocrd_dinglehopper(*args, **kwargs): return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) - class OcrdDinglehopperEvaluate(Processor): - def __init__(self, *args, **kwargs): - kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"] - kwargs["version"] = OCRD_TOOL["version"] - super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) - def process(self): - assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR") - assert_file_grp_cardinality(self.output_file_grp, 1) + @cached_property + def executable(self): + return 'ocrd-dinglehopper' - log = getLogger("processor.OcrdDinglehopperEvaluate") + def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: + assert self.parameter metrics = self.parameter["metrics"] textequiv_level = self.parameter["textequiv_level"] - gt_grp, ocr_grp = self.input_file_grp.split(",") - - input_file_tuples = self.zip_input_files(on_error="abort") - for n, (gt_file, ocr_file) in enumerate(input_file_tuples): - if not gt_file or not ocr_file: - # file/page was not found in this group - continue - gt_file = self.workspace.download_file(gt_file) - ocr_file = self.workspace.download_file(ocr_file) - page_id = gt_file.pageId - log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file) - - file_id = make_file_id(ocr_file, self.output_file_grp) - report_prefix = os.path.join(self.output_file_grp, file_id) - - # Process the files - try: - os.mkdir(self.output_file_grp) - except FileExistsError: - pass - cli_process( - gt_file.local_filename, - ocr_file.local_filename, - report_prefix, - metrics=metrics, - textequiv_level=textequiv_level, + # wrong number of inputs: let fail + gt_file, ocr_file = input_files + # missing on either side: skip (zip_input_files already warned) + if not gt_file or not ocr_file: + return + # missing download (i.e. OCRD_DOWNLOAD_INPUT=false): + if not gt_file.local_filename: + if config.OCRD_MISSING_INPUT == 'ABORT': + raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype) + return + if not ocr_file.local_filename: + if config.OCRD_MISSING_INPUT == 'ABORT': + raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype) + return + + page_id = gt_file.pageId + + file_id = make_file_id(ocr_file, self.output_file_grp) + cli_process( + gt_file.local_filename, + ocr_file.local_filename, + file_id, + self.output_file_grp, + metrics=metrics, + textequiv_level=textequiv_level, + ) + + # Add reports to the workspace + for report_suffix, mimetype in [ + [".html", "text/html"], + [".json", "application/json"], + ]: + output_file_id = file_id + report_suffix + output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) + if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': + raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set") + self.workspace.add_file( + file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=page_id, + mimetype=mimetype, + local_filename=file_id + report_suffix, ) - # Add reports to the workspace - for report_suffix, mimetype in [ - [".html", "text/html"], - [".json", "application/json"], - ]: - self.workspace.add_file( - file_id=file_id + report_suffix, - file_grp=self.output_file_grp, - page_id=page_id, - mimetype=mimetype, - local_filename=report_prefix + report_suffix, - ) - if __name__ == "__main__": ocrd_dinglehopper()