From 63031b30bff9a7dc0033e8da4f3dd646e3e93949 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 11 Apr 2025 17:25:17 +0200 Subject: [PATCH 01/10] Port to OCR-D/core API v3 --- .dockerignore | 5 ++ Dockerfile | 33 +++++++--- Makefile | 9 ++- src/dinglehopper/ocrd-tool.json | 10 +-- src/dinglehopper/ocrd_cli.py | 106 +++++++++++++++----------------- 5 files changed, 87 insertions(+), 76 deletions(-) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..a8312db --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +src/dinglehopper/tests +dist +build +*.egg-info +.git diff --git a/Dockerfile b/Dockerfile index 04e7330..d4b2b76 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,17 +6,30 @@ LABEL \ maintainer="https://github.com/qurator-spk/dinglehopper/issues" \ org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ - org.label-schema.build-date=$BUILD_DATE + org.label-schema.build-date=$BUILD_DATE \ + org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ + org.opencontainers.image.title="dinglehopper" \ + org.opencontainers.image.description="The OCR evaluation tool" \ + org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \ + org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \ + org.opencontainers.image.revision=$VCS_REF \ + org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.base.name=ocrd/core + +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +# avoid HOME/.local/share (hard to predict USER here) +# so let XDG_DATA_HOME coincide with fixed system location +# (can still be overridden by derived stages) +ENV XDG_DATA_HOME /usr/local/share +# avoid the need for an extra volume for persistent resource user db +# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml) +ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources WORKDIR /build/dinglehopper -COPY pyproject.toml . -COPY src/dinglehopper/ocrd-tool.json . -COPY src ./src -COPY requirements.txt . -COPY README.md . -COPY Makefile . -RUN make install -RUN rm -rf /build/dinglehopper +COPY . . +RUN make install && rm -rf /build/dinglehopper WORKDIR /data -VOLUME ["/data"] +VOLUME /data diff --git a/Makefile b/Makefile index babaf5f..2a4b13c 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,9 @@ PYTHON = python3 PIP = pip3 PYTHONIOENCODING=utf8 +PYTEST_ARGS = -vv -DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0 +DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0 DOCKER_TAG = ocrd/dinglehopper help: @@ -16,6 +17,12 @@ help: install: $(PIP) install . +install-dev: + $(PIP) install -e . + +test: + pytest $(PYTEST_ARGS) + docker: docker build \ --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json index f4572c7..00d5d2b 100644 --- a/src/dinglehopper/ocrd-tool.json +++ b/src/dinglehopper/ocrd-tool.json @@ -1,17 +1,13 @@ { "version": "0.9.7", "git_url": "https://github.com/qurator-spk/dinglehopper", + "dockerhub": "ocrd/dinglehopper", "tools": { "ocrd-dinglehopper": { "executable": "ocrd-dinglehopper", + "input_file_grp_cardinality": 2, + "output_file_grp_cardinality": 1, "description": "Evaluate OCR text against ground truth with dinglehopper", - "input_file_grp": [ - "OCR-D-GT-PAGE", - "OCR-D-OCR" - ], - "output_file_grp": [ - "OCR-D-OCR-EVAL" - ], "categories": [ "Quality assurance" ], diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 4da4960..9696ff9 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -1,83 +1,73 @@ -import json +from functools import cached_property import os +from typing import Optional import click -import importlib_resources +from ocrd_models import OcrdFileType from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id +from ocrd_utils import make_file_id from .cli import process as cli_process -OCRD_TOOL = json.loads( - importlib_resources.files(__name__) - .joinpath("ocrd-tool.json") - .read_text(encoding="utf-8", errors="strict") -) - - @click.command() @ocrd_cli_options def ocrd_dinglehopper(*args, **kwargs): return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) - class OcrdDinglehopperEvaluate(Processor): - def __init__(self, *args, **kwargs): - kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"] - kwargs["version"] = OCRD_TOOL["version"] - super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) - def process(self): - assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR") - assert_file_grp_cardinality(self.output_file_grp, 1) + @cached_property + def executable(self): + return 'ocrd-dinglehopper' - log = getLogger("processor.OcrdDinglehopperEvaluate") + def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: + assert self.parameter metrics = self.parameter["metrics"] textequiv_level = self.parameter["textequiv_level"] - gt_grp, ocr_grp = self.input_file_grp.split(",") - - input_file_tuples = self.zip_input_files(on_error="abort") - for n, (gt_file, ocr_file) in enumerate(input_file_tuples): - if not gt_file or not ocr_file: - # file/page was not found in this group - continue - gt_file = self.workspace.download_file(gt_file) - ocr_file = self.workspace.download_file(ocr_file) - page_id = gt_file.pageId - log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file) - - file_id = make_file_id(ocr_file, self.output_file_grp) - report_prefix = os.path.join(self.output_file_grp, file_id) - - # Process the files - try: - os.mkdir(self.output_file_grp) - except FileExistsError: - pass - cli_process( - gt_file.local_filename, - ocr_file.local_filename, - report_prefix, - metrics=metrics, - textequiv_level=textequiv_level, + try: + gt_file, ocr_file = input_files + assert gt_file, 'missing GT file' + assert ocr_file, 'missing OCR file' + assert gt_file.local_filename + assert ocr_file.local_filename + except (ValueError, AssertionError) as err: + self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page? + return + + page_id = gt_file.pageId + + file_id = make_file_id(ocr_file, self.output_file_grp) + report_prefix = os.path.join(self.output_file_grp, file_id) + + # Process the files + try: + os.mkdir(self.output_file_grp) + except FileExistsError: + pass + cli_process( + gt_file.local_filename, + ocr_file.local_filename, + report_prefix, + metrics=metrics, + textequiv_level=textequiv_level, + ) + + # Add reports to the workspace + for report_suffix, mimetype in [ + [".html", "text/html"], + [".json", "application/json"], + ]: + self.workspace.add_file( + file_id=file_id + report_suffix, + file_grp=self.output_file_grp, + page_id=page_id, + mimetype=mimetype, + local_filename=report_prefix + report_suffix, ) - # Add reports to the workspace - for report_suffix, mimetype in [ - [".html", "text/html"], - [".json", "application/json"], - ]: - self.workspace.add_file( - file_id=file_id + report_suffix, - file_grp=self.output_file_grp, - page_id=page_id, - mimetype=mimetype, - local_filename=report_prefix + report_suffix, - ) - if __name__ == "__main__": ocrd_dinglehopper() From f287386c0e8b315a077e2400965b56c1e9759cc4 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 16 Apr 2025 14:49:23 +0200 Subject: [PATCH 02/10] =?UTF-8?q?=F0=9F=A7=B9Don't=20pin=20uniseg=20and=20?= =?UTF-8?q?rapidfuzz?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Breakage with the newest uniseg API was fixed in master. Can't see any issue with rapidfuzz, so removing that pin, too. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0b3d819..123187b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ uniseg >= 0.9.1 numpy colorama MarkupSafe -ocrd >= 2.65.0 +ocrd >= 3.3.0 attrs multimethod >= 1.3 tqdm From 8c1b6d65f57f1fba9c7e71980cb97934460b7073 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 11 Apr 2025 17:49:53 +0200 Subject: [PATCH 03/10] Dockerfile: build ocrd-all-tool.json --- Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile b/Dockerfile index d4b2b76..75dfcdd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,6 +29,9 @@ ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources WORKDIR /build/dinglehopper COPY . . +COPY ocrd-tool.json . +# prepackage ocrd-tool.json as ocrd-all-tool.json +RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json RUN make install && rm -rf /build/dinglehopper WORKDIR /data From c0aa82d18885402ddc0093dfc75a07e0c23a0e5b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 16 Apr 2025 14:00:05 +0200 Subject: [PATCH 04/10] OCR-D processor: properly handle missing or non-downloaded GT/OCR file Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/dinglehopper/ocrd_cli.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 9696ff9..52da817 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -27,14 +27,19 @@ class OcrdDinglehopperEvaluate(Processor): metrics = self.parameter["metrics"] textequiv_level = self.parameter["textequiv_level"] - try: - gt_file, ocr_file = input_files - assert gt_file, 'missing GT file' - assert ocr_file, 'missing OCR file' - assert gt_file.local_filename - assert ocr_file.local_filename - except (ValueError, AssertionError) as err: - self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page? + # wrong number of inputs: let fail + gt_file, ocr_file = input_files + # missing on either side: skip (zip_input_files already warned) + if not gt_file or not ocr_file: + return + # missing download (i.e. OCRD_DOWNLOAD_INPUT=false): + if not gt_file.local_filename: + if config.OCRD_MISSING_INPUT == 'ABORT': + raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype) + return + if not ocr_file.local_filename: + if config.OCRD_MISSING_INPUT == 'ABORT': + raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype) return page_id = gt_file.pageId From 4162836612661a0232ff8783af56c65561df8c48 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 16 Apr 2025 18:54:58 +0200 Subject: [PATCH 05/10] ocrd_cli: no need to check fileGrp dir exists Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/dinglehopper/ocrd_cli.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 52da817..90db7d1 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -45,17 +45,11 @@ class OcrdDinglehopperEvaluate(Processor): page_id = gt_file.pageId file_id = make_file_id(ocr_file, self.output_file_grp) - report_prefix = os.path.join(self.output_file_grp, file_id) - - # Process the files - try: - os.mkdir(self.output_file_grp) - except FileExistsError: - pass cli_process( gt_file.local_filename, ocr_file.local_filename, - report_prefix, + file_id, + self.output_file_grp, metrics=metrics, textequiv_level=textequiv_level, ) From f6a2c94520dcf79892278320b29e3906d4a5f4bb Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 16 Apr 2025 18:55:42 +0200 Subject: [PATCH 06/10] ocrd_cli: but do check for existing output files Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/dinglehopper/ocrd_cli.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 90db7d1..dbf59be 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -59,8 +59,12 @@ class OcrdDinglehopperEvaluate(Processor): [".html", "text/html"], [".json", "application/json"], ]: + output_file_id = file_id + report_suffix + output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) + if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': + raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set") self.workspace.add_file( - file_id=file_id + report_suffix, + file_id=output_file_id, file_grp=self.output_file_grp, page_id=page_id, mimetype=mimetype, From 831a24fc4ca606cc04bd37a8217a52654e67d3f4 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 16 Apr 2025 19:03:13 +0200 Subject: [PATCH 07/10] typo: report_prefix -> file_id --- src/dinglehopper/ocrd_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index dbf59be..fa4747f 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -68,7 +68,7 @@ class OcrdDinglehopperEvaluate(Processor): file_grp=self.output_file_grp, page_id=page_id, mimetype=mimetype, - local_filename=report_prefix + report_suffix, + local_filename=file_id + report_suffix, ) From b7bdca4ac88a57660814aa83848ff1b2f86fecd6 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 17 Apr 2025 08:09:06 +0200 Subject: [PATCH 08/10] =?UTF-8?q?=F0=9F=90=9B=20Makefile:=20Make=20phony?= =?UTF-8?q?=20targets=20.PHONY?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2a4b13c..12f342a 100644 --- a/Makefile +++ b/Makefile @@ -30,4 +30,4 @@ docker: --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ -t $(DOCKER_TAG) . -.PHONY: help install docker +.PHONY: help install install-dev test docker From d974369e13e3bf5f20e24084a27b912430717150 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 17 Apr 2025 08:10:56 +0200 Subject: [PATCH 09/10] =?UTF-8?q?=F0=9F=90=9B=20Docker:=20Fix=20descriptio?= =?UTF-8?q?n?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 75dfcdd..f942d78 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ LABEL \ org.label-schema.build-date=$BUILD_DATE \ org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ org.opencontainers.image.title="dinglehopper" \ - org.opencontainers.image.description="The OCR evaluation tool" \ + org.opencontainers.image.description="An OCR evaluation tool" \ org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \ org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \ org.opencontainers.image.revision=$VCS_REF \ From 13ab1ae150481b915c856700c6b0348fb4ba6884 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 17 Apr 2025 08:26:36 +0200 Subject: [PATCH 10/10] =?UTF-8?q?=F0=9F=90=9B=20Docker:=20Use=20same=20ven?= =?UTF-8?q?dor=20as=20license=20for=20now?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f942d78..e497d16 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ LABEL \ org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ org.label-schema.build-date=$BUILD_DATE \ - org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ + org.opencontainers.image.vendor="qurator" \ org.opencontainers.image.title="dinglehopper" \ org.opencontainers.image.description="An OCR evaluation tool" \ org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \