From b336f98271036830dcf5d2456ffa8b87752e9c16 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 6 May 2024 18:14:16 +0200 Subject: [PATCH 01/67] =?UTF-8?q?=F0=9F=90=9B=20Fix=20reading=20plain=20te?= =?UTF-8?q?xt=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As reported by @tallemeersch in gh-107, newlines were not removed for plain text files. Fix this by stripping the lines as suggested. Fixes gh-107. --- src/dinglehopper/ocr_files.py | 4 ++-- src/dinglehopper/tests/test_ocr_files.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py index 0c4fa04..1593f44 100644 --- a/src/dinglehopper/ocr_files.py +++ b/src/dinglehopper/ocr_files.py @@ -36,7 +36,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]: for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap): line_id = line.attrib.get("ID") line_text = " ".join( - string.attrib.get("CONTENT") + string.attrib.get("CONTENT", "") for string in line.iterfind("alto:String", namespaces=nsmap) ) normalized_text = normalize_sbb(line_text) @@ -167,7 +167,7 @@ def plain_extract(filename, include_filename_in_id=False): with open(filename, "r", encoding=fileencoding) as f: return ExtractedText( None, - [make_segment(no, line) for no, line in enumerate(f.readlines())], + [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())], "\n", None, None, diff --git a/src/dinglehopper/tests/test_ocr_files.py b/src/dinglehopper/tests/test_ocr_files.py index 4790c85..342507a 100644 --- a/src/dinglehopper/tests/test_ocr_files.py +++ b/src/dinglehopper/tests/test_ocr_files.py @@ -177,8 +177,8 @@ def test_text(): def test_plain(tmp_path): with working_directory(tmp_path): with open("ocr.txt", "w") as ocrf: - ocrf.write("AAAAB") + ocrf.write("First, a line.\nAnd a second line.\n") result = plain_text("ocr.txt") - expected = "AAAAB" + expected = "First, a 
line.\nAnd a second line." assert result == expected From a534b5e28e4317b150bf43d5bf3ef2d314afff90 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 13 May 2024 21:16:29 +0200 Subject: [PATCH 02/67] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 640db3b..2a2cf1e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,7 +16,7 @@ repos: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.3 + rev: v0.4.4 hooks: - args: - --fix From c91234daba29744586c6eab17575b793560d95f4 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 13 May 2024 21:17:42 +0200 Subject: [PATCH 03/67] =?UTF-8?q?=E2=9C=94=20GitHub=20Actions:=20Update=20?= =?UTF-8?q?used=20actions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/release.yml | 8 ++++---- .github/workflows/test.yml | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8c193df..3f51bd7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Upgrade pip run: python3 -m pip install --upgrade pip - name: Install setuptools @@ -32,7 +32,7 @@ jobs: - name: Build package run: python3 -m pip install --upgrade build && python3 -m build - name: Upload dist - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: dist path: dist/ @@ -42,7 +42,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download dist - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: dist path: dist/ @@ -61,7 +61,7 @@ 
jobs: id-token: write # IMPORTANT: this permission is mandatory for trusted publishing steps: - name: Download dist - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: dist path: dist/ diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f049c2c..f40c830 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -31,12 +31,12 @@ jobs: steps: - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install possible lxml build requirements (if building from source) run: sudo apt-get install -y libxml2-dev libxslt-dev python3-dev @@ -56,7 +56,7 @@ jobs: cd src python3 -m pytest --junitxml=../${{matrix.python-version}}-junit.xml -o junit_family=legacy - name: Upload test results - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: success() || failure() with: name: test-results-${{matrix.python-version}} From bc5818da9f9d0ae44fcc7580ed458eb8a900be89 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 14 May 2024 15:56:08 +0200 Subject: [PATCH 04/67] =?UTF-8?q?=E2=9C=94=20GitHub=20Actions:=20Update=20?= =?UTF-8?q?used=20actions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/test_report.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml index 908a593..26f411b 100644 --- a/.github/workflows/test_report.yml +++ b/.github/workflows/test_report.yml @@ -12,7 +12,7 @@ jobs: report: runs-on: ubuntu-latest steps: - - uses: dorny/test-reporter@v1.7.0 + - uses: dorny/test-reporter@v1 with: artifact: /test-results-(.*)/ name: 'Tests Results - $1' From cd68a973cb43ce33790d6f52612a684d933a31e4 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sun, 26 May 2024 09:18:00 +0200 
Subject: [PATCH 05/67] Fix typo Signed-off-by: Stefan Weil --- src/dinglehopper/extracted_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dinglehopper/extracted_text.py b/src/dinglehopper/extracted_text.py index 6dcf0a7..acfbf78 100644 --- a/src/dinglehopper/extracted_text.py +++ b/src/dinglehopper/extracted_text.py @@ -149,7 +149,7 @@ class ExtractedText: raise ValueError("Can't have joiner without segments to join") if self.segments is not None: if value not in ("", " ", "\n"): - raise ValueError(f"Unexcepted segment joiner value {repr(value)}") + raise ValueError(f"Unexpected segment joiner value {repr(value)}") @_text.validator def is_valid_text(self, _, value): From 4047f8b6e537158233f69d2257062b0038d122a0 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 9 Jul 2024 21:01:31 +0200 Subject: [PATCH 06/67] =?UTF-8?q?=F0=9F=90=9B=20Fix=20loading=20ocrd-tool.?= =?UTF-8?q?json=20for=20Python=203.12?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 + src/dinglehopper/ocrd_cli.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 846d389..6741fa2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ multimethod >= 1.3 tqdm rapidfuzz >= 2.7.0 chardet +importlib_resources diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 8eebdc0..401db6b 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -1,3 +1,4 @@ +import importlib_resources import json import os @@ -5,11 +6,14 @@ import click from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id -from pkg_resources import resource_string from .cli import process as cli_process -OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8")) +OCRD_TOOL = 
json.loads( + importlib_resources.files(__name__) + .joinpath("ocrd-tool.json") + .read_text(encoding="utf-8", errors="strict") +) @click.command() From d1a224761537fe0239c1486c2f6bc778d70ef76e Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 9 Jul 2024 21:07:59 +0200 Subject: [PATCH 07/67] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a2cf1e..504773b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,7 +16,7 @@ repos: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.4 + rev: v0.5.1 hooks: - args: - --fix @@ -24,7 +24,7 @@ repos: id: ruff - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.10.0 + rev: v1.10.1 hooks: - additional_dependencies: - types-setuptools @@ -36,6 +36,6 @@ repos: id: mypy - repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update - rev: v0.3.1post2 + rev: v0.3.3post1 hooks: - id: pre-commit-update From 2ee37ed4e39b4e973bc807ebba26a97afed578c5 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 11 Jul 2024 16:25:38 +0200 Subject: [PATCH 08/67] =?UTF-8?q?=F0=9F=8E=A8=20Sort=20imports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/ocrd_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 401db6b..cfaca39 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -1,8 +1,8 @@ -import importlib_resources import json import os import click +import importlib_resources from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id From 
cf998443c15673fcef976cd00251b38cf7158a0e Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 11 Jul 2024 17:15:24 +0200 Subject: [PATCH 09/67] =?UTF-8?q?=E2=9A=99=20ruff:=20Update=20settings=20(?= =?UTF-8?q?select=20=E2=86=92=20lint.select)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 41d45ba..25efdcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,5 +74,5 @@ disallow_untyped_defs = false disallow_untyped_calls = false -[tool.ruff] +[tool.ruff.lint] select = ["E", "F", "I"] From 129e6eb427b0d5d306f76c7f443ee7cd08e83495 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 11 Jul 2024 17:25:38 +0200 Subject: [PATCH 10/67] =?UTF-8?q?=F0=9F=93=A6=20v0.9.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/ocrd-tool.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json index 27ee989..f4572c7 100644 --- a/src/dinglehopper/ocrd-tool.json +++ b/src/dinglehopper/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.9.6", + "version": "0.9.7", "git_url": "https://github.com/qurator-spk/dinglehopper", "tools": { "ocrd-dinglehopper": { From 2e9e88cc1e3db0390636a5ca79f315b1c0d153e1 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Fri, 19 Jul 2024 09:56:40 +0200 Subject: [PATCH 11/67] =?UTF-8?q?=E2=9A=99=20pre-commit:=20Update=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 504773b..b6f88ef 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,7 +16,7 @@ repos: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.1 + rev: 
v0.5.3 hooks: - args: - --fix From 27ad145c7e303439ef413505b1cb1178bc23370b Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Fri, 19 Jul 2024 09:58:01 +0200 Subject: [PATCH 12/67] =?UTF-8?q?=E2=9A=99=20pyproject.toml:=20Add=20licen?= =?UTF-8?q?se.file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 25efdcd..c2263e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ authors = [ ] description = "The OCR evaluation tool" readme = "README.md" +license.file = "LICENSE" requires-python = ">=3.8" keywords = ["qurator", "ocr", "evaluation", "ocr-d"] From 6d1daf1dfe99575655ec0de0f7155c047e3b9b30 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Fri, 19 Jul 2024 14:41:54 +0200 Subject: [PATCH 13/67] =?UTF-8?q?=E2=9C=A8=20Support=20--version=20option?= =?UTF-8?q?=20in=20CLI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py index 78ac33c..b67e9cc 100644 --- a/src/dinglehopper/cli.py +++ b/src/dinglehopper/cli.py @@ -234,6 +234,7 @@ def process_dir( metavar="LEVEL", ) @click.option("--progress", default=False, is_flag=True, help="Show progress bar") +@click.version_option() def main( gt, ocr, From f2e290dffea6179a21d35afea0c1902ca1bbb0fa Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Fri, 19 Jul 2024 14:54:46 +0200 Subject: [PATCH 14/67] =?UTF-8?q?=F0=9F=90=9B=20Fix=20--version=20option?= =?UTF-8?q?=20in=20OCR-D=20CLI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/ocrd_cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index cfaca39..4da4960 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py 
@@ -25,6 +25,7 @@ def ocrd_dinglehopper(*args, **kwargs): class OcrdDinglehopperEvaluate(Processor): def __init__(self, *args, **kwargs): kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"] + kwargs["version"] = OCRD_TOOL["version"] super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) def process(self): From 3233dbcc8f036ebe83ae268813006f1476218d7c Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 22 Jul 2024 16:54:33 +0200 Subject: [PATCH 15/67] =?UTF-8?q?=E2=9C=94=20pre-commit:=20Add=20license?= =?UTF-8?q?=20check?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pre-commit-config.yaml | 10 ++++++++-- pyproject.toml | 31 +++++++++++++++++++++++++++++++ requirements-dev.txt | 2 ++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b6f88ef..4baed11 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,7 +16,7 @@ repos: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.3 + rev: v0.5.4 hooks: - args: - --fix @@ -24,7 +24,7 @@ repos: id: ruff - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.10.1 + rev: v1.11.0 hooks: - additional_dependencies: - types-setuptools @@ -39,3 +39,9 @@ repos: rev: v0.3.3post1 hooks: - id: pre-commit-update + +- repo: https://github.com/dhatim/python-license-check + rev: 0.9.2 + hooks: + - id: liccheck + language: system diff --git a/pyproject.toml b/pyproject.toml index c2263e0..62bac78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,3 +77,34 @@ disallow_untyped_calls = false [tool.ruff.lint] select = ["E", "F", "I"] + + +[tool.liccheck] +authorized_licenses = [ + "bsd", + "new bsd", + "bsd license", + "new bsd license", + "simplified bsd", + "apache", + "apache 2.0", + "apache software license", + "apache software", + "apache license 2.0", + "gnu lgpl", + "lgpl with exceptions or zpl", + "GNU Library or Lesser General Public License 
(LGPL)", + "GNU Lesser General Public License v3 (LGPLv3)", + "GNU Lesser General Public License v2 or later (LGPLv2+)", + "mit", + "mit license", + "python software foundation", + "Historical Permission Notice and Disclaimer (HPND)", + "public domain", + 'The Unlicense (Unlicense)', + "isc", + 'Mozilla Public License 2.0 (MPL 2.0)', +] +unauthorized_licenses = [ + "gpl v3", +] diff --git a/requirements-dev.txt b/requirements-dev.txt index 16ae880..f9f748a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,3 +10,5 @@ mypy types-lxml types-setuptools pytest-mypy + +liccheck From 1753ed4d1363c9d22cdb56494dbeb7eaed78901b Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 6 May 2024 17:02:52 +0200 Subject: [PATCH 16/67] =?UTF-8?q?=E2=9C=94=20Test=20on=20Python=203.13?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f40c830..387f7a2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,7 +25,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ] runs-on: "ubuntu-latest" @@ -34,6 +34,7 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + allow-prereleases: true - name: Checkout uses: actions/checkout@v4 From 9c7c104dcec89663a4ddab0e83334db79a639184 Mon Sep 17 00:00:00 2001 From: joschrew <91774427+joschrew@users.noreply.github.com> Date: Wed, 2 Oct 2024 15:29:36 +0200 Subject: [PATCH 17/67] Add Dockerfile and Makefile to create ocr-d image --- Dockerfile | 22 ++++++++++++++++++++++ Makefile | 26 ++++++++++++++++++++++++++ pyproject.toml | 2 +- 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 Dockerfile create mode 100644 Makefile diff --git a/Dockerfile 
b/Dockerfile new file mode 100644 index 0000000..a66d718 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,22 @@ +ARG DOCKER_BASE_IMAGE +FROM $DOCKER_BASE_IMAGE +ARG VCS_REF +ARG BUILD_DATE +LABEL \ + maintainer="https://ocr-d.de/kontakt" \ + org.label-schema.vcs-ref=$VCS_REF \ + org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ + org.label-schema.build-date=$BUILD_DATE + +WORKDIR /build/dinglehopper +COPY pyproject.toml . +COPY src/dinglehopper/ocrd-tool.json . +COPY src ./src +COPY requirements.txt . +COPY README.md . +COPY Makefile . +RUN make install +RUN rm -rf /build/dinglehopper + +WORKDIR /data +VOLUME ["/data"] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..babaf5f --- /dev/null +++ b/Makefile @@ -0,0 +1,26 @@ +PYTHON = python3 +PIP = pip3 +PYTHONIOENCODING=utf8 + +DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0 +DOCKER_TAG = ocrd/dinglehopper + +help: + @echo + @echo " Targets" + @echo + @echo " install Install full Python package via pip" + @echo " docker Build the ocrd/dinglehopper docker image" + +# Install Python package via pip +install: + $(PIP) install . + +docker: + docker build \ + --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ + --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ + --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ + -t $(DOCKER_TAG) . 
+ +.PHONY: help install docker diff --git a/pyproject.toml b/pyproject.toml index 62bac78..a94e0b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ optional-dependencies.dev = {file = ["requirements-dev.txt"]} where = ["src"] [tool.setuptools.package-data] -dinglehopper = ["templates/*"] +dinglehopper = ["templates/*", "*.json"] [tool.pytest.ini_options] From 6ecf49a355eb2f413a38552ca8187ab794b98d3f Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 7 Oct 2024 17:39:42 +0200 Subject: [PATCH 18/67] Update Dockerfile Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a66d718..a7bda6f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,11 @@ FROM $DOCKER_BASE_IMAGE ARG VCS_REF ARG BUILD_DATE LABEL \ - maintainer="https://ocr-d.de/kontakt" \ +LABEL \ + maintainer="https://github.com/qurator-spk/dinglehopper/issues" \ + org.label-schema.vcs-ref=$VCS_REF \ + org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ + org.label-schema.build-date=$BUILD_DATE org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ org.label-schema.build-date=$BUILD_DATE From 6b82293670ea7b642d65e7114a9a4d0c8897a619 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Mon, 7 Oct 2024 17:41:59 +0200 Subject: [PATCH 19/67] Update Dockerfile I fancy-clicked @bertsky's change suggestion, which duplicated some labels. Now fancy-clicking the fix, fingers crossed... 
--- Dockerfile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index a7bda6f..04e7330 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,15 +2,11 @@ ARG DOCKER_BASE_IMAGE FROM $DOCKER_BASE_IMAGE ARG VCS_REF ARG BUILD_DATE -LABEL \ LABEL \ maintainer="https://github.com/qurator-spk/dinglehopper/issues" \ org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ org.label-schema.build-date=$BUILD_DATE - org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ - org.label-schema.build-date=$BUILD_DATE WORKDIR /build/dinglehopper COPY pyproject.toml . From 058042accbb7a9425220714c947a5e50193d9220 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 16 Apr 2025 08:59:58 +0200 Subject: [PATCH 20/67] =?UTF-8?q?=E2=9A=99=20=20pre-commit:=20update?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pre-commit-config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4baed11..403658e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -11,12 +11,12 @@ repos: - id: check-ast - repo: https://github.com/psf/black - rev: 24.4.2 + rev: 25.1.0 hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.4 + rev: v0.11.5 hooks: - args: - --fix @@ -24,7 +24,7 @@ repos: id: ruff - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.11.0 + rev: v1.15.0 hooks: - additional_dependencies: - types-setuptools @@ -36,7 +36,7 @@ repos: id: mypy - repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update - rev: v0.3.3post1 + rev: v0.6.1 hooks: - id: pre-commit-update From 7f8a8dd56453ef4df6290615a60011355e247287 Mon Sep 17 
00:00:00 2001 From: Mike Gerber Date: Wed, 16 Apr 2025 09:10:43 +0200 Subject: [PATCH 21/67] =?UTF-8?q?=F0=9F=90=9B=20Fix=20for=20changed=20API?= =?UTF-8?q?=20of=20uniseg's=20word=5Fbreak?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 2 +- src/dinglehopper/word_error_rate.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6741fa2..0b3d819 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ click jinja2 lxml -uniseg >= 0.8.0 +uniseg >= 0.9.1 numpy colorama MarkupSafe diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py index 578850f..ec039b3 100644 --- a/src/dinglehopper/word_error_rate.py +++ b/src/dinglehopper/word_error_rate.py @@ -22,11 +22,11 @@ def patch_word_break(): """ old_word_break = uniseg.wordbreak.word_break - def new_word_break(c, index=0): + def new_word_break(c): if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area - return uniseg.wordbreak.WordBreak.ALETTER + return uniseg.wordbreak.Word_Break.ALetter else: - return old_word_break(c, index) + return old_word_break(c) uniseg.wordbreak.word_break = new_word_break global word_break_patched From badfa9c99e07fa3faf09c276cd1f8bc3745e5b9a Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 16 Apr 2025 09:25:44 +0200 Subject: [PATCH 22/67] =?UTF-8?q?=E2=9A=99=20=20GitHub=20Actions:=20Don't?= =?UTF-8?q?=20test=20on=20Python=203.8=20anymore?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 387f7a2..277d4ba 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,7 +25,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ] + python-version: [ "3.9", 
"3.10", "3.11", "3.12", "3.13" ] runs-on: "ubuntu-latest" From ce7886af23f2f43691a81002da72060dac902ae4 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 16 Apr 2025 10:57:10 +0200 Subject: [PATCH 23/67] =?UTF-8?q?=E2=9A=99=20=20pyproject.toml:=20Update?= =?UTF-8?q?=20supported=20Python=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a94e0b9..7668e13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ authors = [ description = "The OCR evaluation tool" readme = "README.md" license.file = "LICENSE" -requires-python = ">=3.8" +requires-python = ">=3.9" keywords = ["qurator", "ocr", "evaluation", "ocr-d"] dynamic = ["version", "dependencies", "optional-dependencies"] From d3aa9eb5201833859c15049cf8203085a1dd7fca Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 16 Apr 2025 11:09:33 +0200 Subject: [PATCH 24/67] =?UTF-8?q?=E2=9A=99=20=20liccheck:=20update=20permi?= =?UTF-8?q?ssable=20licenses=20(mit-cmu,=20psf=202.0,=20iscl)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 7668e13..3c02d33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,11 +98,15 @@ authorized_licenses = [ "GNU Lesser General Public License v2 or later (LGPLv2+)", "mit", "mit license", + "mit-cmu", "python software foundation", + "psf", + "psf-2.0", "Historical Permission Notice and Disclaimer (HPND)", "public domain", 'The Unlicense (Unlicense)', "isc", + "ISC License (ISCL)", 'Mozilla Public License 2.0 (MPL 2.0)', ] unauthorized_licenses = [ From 63031b30bff9a7dc0033e8da4f3dd646e3e93949 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 11 Apr 2025 17:25:17 +0200 Subject: [PATCH 25/67] Port to OCR-D/core API v3 --- .dockerignore | 5 ++ Dockerfile | 
33 +++++++---- Makefile | 9 ++- src/dinglehopper/ocrd-tool.json | 10 +--- src/dinglehopper/ocrd_cli.py | 100 ++++++++++++++------------------ 5 files changed, 84 insertions(+), 73 deletions(-) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..a8312db --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +src/dinglehopper/tests +dist +build +*.egg-info +.git diff --git a/Dockerfile b/Dockerfile index 04e7330..d4b2b76 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,17 +6,30 @@ LABEL \ maintainer="https://github.com/qurator-spk/dinglehopper/issues" \ org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ - org.label-schema.build-date=$BUILD_DATE + org.label-schema.build-date=$BUILD_DATE \ + org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ + org.opencontainers.image.title="dinglehopper" \ + org.opencontainers.image.description="The OCR evaluation tool" \ + org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \ + org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \ + org.opencontainers.image.revision=$VCS_REF \ + org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.base.name=ocrd/core + +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +# avoid HOME/.local/share (hard to predict USER here) +# so let XDG_DATA_HOME coincide with fixed system location +# (can still be overridden by derived stages) +ENV XDG_DATA_HOME /usr/local/share +# avoid the need for an extra volume for persistent resource user db +# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml) +ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources WORKDIR /build/dinglehopper -COPY pyproject.toml . -COPY src/dinglehopper/ocrd-tool.json . -COPY src ./src -COPY requirements.txt . -COPY README.md . -COPY Makefile . 
-RUN make install -RUN rm -rf /build/dinglehopper +COPY . . +RUN make install && rm -rf /build/dinglehopper WORKDIR /data -VOLUME ["/data"] +VOLUME /data diff --git a/Makefile b/Makefile index babaf5f..2a4b13c 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,9 @@ PYTHON = python3 PIP = pip3 PYTHONIOENCODING=utf8 +PYTEST_ARGS = -vv -DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0 +DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0 DOCKER_TAG = ocrd/dinglehopper help: @@ -16,6 +17,12 @@ help: install: $(PIP) install . +install-dev: + $(PIP) install -e . + +test: + pytest $(PYTEST_ARGS) + docker: docker build \ --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json index f4572c7..00d5d2b 100644 --- a/src/dinglehopper/ocrd-tool.json +++ b/src/dinglehopper/ocrd-tool.json @@ -1,17 +1,13 @@ { "version": "0.9.7", "git_url": "https://github.com/qurator-spk/dinglehopper", + "dockerhub": "ocrd/dinglehopper", "tools": { "ocrd-dinglehopper": { "executable": "ocrd-dinglehopper", + "input_file_grp_cardinality": 2, + "output_file_grp_cardinality": 1, "description": "Evaluate OCR text against ground truth with dinglehopper", - "input_file_grp": [ - "OCR-D-GT-PAGE", - "OCR-D-OCR" - ], - "output_file_grp": [ - "OCR-D-OCR-EVAL" - ], "categories": [ "Quality assurance" ], diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 4da4960..9696ff9 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -1,83 +1,73 @@ -import json +from functools import cached_property import os +from typing import Optional import click -import importlib_resources +from ocrd_models import OcrdFileType from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id +from ocrd_utils import make_file_id from .cli import process as cli_process -OCRD_TOOL = json.loads( - 
importlib_resources.files(__name__) - .joinpath("ocrd-tool.json") - .read_text(encoding="utf-8", errors="strict") -) - - @click.command() @ocrd_cli_options def ocrd_dinglehopper(*args, **kwargs): return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) - class OcrdDinglehopperEvaluate(Processor): - def __init__(self, *args, **kwargs): - kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"] - kwargs["version"] = OCRD_TOOL["version"] - super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) - def process(self): - assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR") - assert_file_grp_cardinality(self.output_file_grp, 1) + @cached_property + def executable(self): + return 'ocrd-dinglehopper' - log = getLogger("processor.OcrdDinglehopperEvaluate") + def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: + assert self.parameter metrics = self.parameter["metrics"] textequiv_level = self.parameter["textequiv_level"] - gt_grp, ocr_grp = self.input_file_grp.split(",") - input_file_tuples = self.zip_input_files(on_error="abort") - for n, (gt_file, ocr_file) in enumerate(input_file_tuples): - if not gt_file or not ocr_file: - # file/page was not found in this group - continue - gt_file = self.workspace.download_file(gt_file) - ocr_file = self.workspace.download_file(ocr_file) - page_id = gt_file.pageId + try: + gt_file, ocr_file = input_files + assert gt_file, 'missing GT file' + assert ocr_file, 'missing OCR file' + assert gt_file.local_filename + assert ocr_file.local_filename + except (ValueError, AssertionError) as err: + self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page? 
+ return - log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file) + page_id = gt_file.pageId - file_id = make_file_id(ocr_file, self.output_file_grp) - report_prefix = os.path.join(self.output_file_grp, file_id) + file_id = make_file_id(ocr_file, self.output_file_grp) + report_prefix = os.path.join(self.output_file_grp, file_id) - # Process the files - try: - os.mkdir(self.output_file_grp) - except FileExistsError: - pass - cli_process( - gt_file.local_filename, - ocr_file.local_filename, - report_prefix, - metrics=metrics, - textequiv_level=textequiv_level, + # Process the files + try: + os.mkdir(self.output_file_grp) + except FileExistsError: + pass + cli_process( + gt_file.local_filename, + ocr_file.local_filename, + report_prefix, + metrics=metrics, + textequiv_level=textequiv_level, + ) + + # Add reports to the workspace + for report_suffix, mimetype in [ + [".html", "text/html"], + [".json", "application/json"], + ]: + self.workspace.add_file( + file_id=file_id + report_suffix, + file_grp=self.output_file_grp, + page_id=page_id, + mimetype=mimetype, + local_filename=report_prefix + report_suffix, ) - # Add reports to the workspace - for report_suffix, mimetype in [ - [".html", "text/html"], - [".json", "application/json"], - ]: - self.workspace.add_file( - file_id=file_id + report_suffix, - file_grp=self.output_file_grp, - page_id=page_id, - mimetype=mimetype, - local_filename=report_prefix + report_suffix, - ) - if __name__ == "__main__": ocrd_dinglehopper() From f287386c0e8b315a077e2400965b56c1e9759cc4 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Wed, 16 Apr 2025 14:49:23 +0200 Subject: [PATCH 26/67] =?UTF-8?q?=F0=9F=A7=B9Don't=20pin=20uniseg=20and=20?= =?UTF-8?q?rapidfuzz?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Breakage with the newest uniseg API was fixed in master. Can't see any issue with rapidfuzz, so removing that pin, too. 
--- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0b3d819..123187b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ uniseg >= 0.9.1 numpy colorama MarkupSafe -ocrd >= 2.65.0 +ocrd >= 3.3.0 attrs multimethod >= 1.3 tqdm From 8c1b6d65f57f1fba9c7e71980cb97934460b7073 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 11 Apr 2025 17:49:53 +0200 Subject: [PATCH 27/67] Dockerfile: build ocrd-all-tool.json --- Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile b/Dockerfile index d4b2b76..75dfcdd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,6 +29,9 @@ ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources WORKDIR /build/dinglehopper COPY . . +COPY ocrd-tool.json . +# prepackage ocrd-tool.json as ocrd-all-tool.json +RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json RUN make install && rm -rf /build/dinglehopper WORKDIR /data From c0aa82d18885402ddc0093dfc75a07e0c23a0e5b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 16 Apr 2025 14:00:05 +0200 Subject: [PATCH 28/67] OCR-D processor: properly handle missing or non-downloaded GT/OCR file Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/dinglehopper/ocrd_cli.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 9696ff9..52da817 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -27,14 +27,19 @@ class OcrdDinglehopperEvaluate(Processor): metrics = self.parameter["metrics"] textequiv_level = self.parameter["textequiv_level"] - try: - gt_file, ocr_file = input_files - assert gt_file, 'missing GT file' - assert ocr_file, 'missing OCR file' - assert gt_file.local_filename - assert ocr_file.local_filename - except (ValueError, AssertionError) as err: - self.logger.warning(f'Missing 
either GT file, OCR file or both: {err}') # TODO how to log which page? + # wrong number of inputs: let fail + gt_file, ocr_file = input_files + # missing on either side: skip (zip_input_files already warned) + if not gt_file or not ocr_file: + return + # missing download (i.e. OCRD_DOWNLOAD_INPUT=false): + if not gt_file.local_filename: + if config.OCRD_MISSING_INPUT == 'ABORT': + raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype) + return + if not ocr_file.local_filename: + if config.OCRD_MISSING_INPUT == 'ABORT': + raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype) return page_id = gt_file.pageId From 4162836612661a0232ff8783af56c65561df8c48 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 16 Apr 2025 18:54:58 +0200 Subject: [PATCH 29/67] ocrd_cli: no need to check fileGrp dir exists Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/dinglehopper/ocrd_cli.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 52da817..90db7d1 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -45,17 +45,11 @@ class OcrdDinglehopperEvaluate(Processor): page_id = gt_file.pageId file_id = make_file_id(ocr_file, self.output_file_grp) - report_prefix = os.path.join(self.output_file_grp, file_id) - - # Process the files - try: - os.mkdir(self.output_file_grp) - except FileExistsError: - pass cli_process( gt_file.local_filename, ocr_file.local_filename, - report_prefix, + file_id, + self.output_file_grp, metrics=metrics, textequiv_level=textequiv_level, ) From f6a2c94520dcf79892278320b29e3906d4a5f4bb Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 16 Apr 2025 18:55:42 +0200 Subject: [PATCH 30/67] ocrd_cli: but do check for existing output files Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/dinglehopper/ocrd_cli.py | 6 
+++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 90db7d1..dbf59be 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -59,8 +59,12 @@ class OcrdDinglehopperEvaluate(Processor): [".html", "text/html"], [".json", "application/json"], ]: + output_file_id = file_id + report_suffix + output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) + if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': + raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set") self.workspace.add_file( - file_id=file_id + report_suffix, + file_id=output_file_id, file_grp=self.output_file_grp, page_id=page_id, mimetype=mimetype, From 831a24fc4ca606cc04bd37a8217a52654e67d3f4 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 16 Apr 2025 19:03:13 +0200 Subject: [PATCH 31/67] typo: report_prefix -> file_id --- src/dinglehopper/ocrd_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index dbf59be..fa4747f 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -68,7 +68,7 @@ class OcrdDinglehopperEvaluate(Processor): file_grp=self.output_file_grp, page_id=page_id, mimetype=mimetype, - local_filename=report_prefix + report_suffix, + local_filename=file_id + report_suffix, ) From b7bdca4ac88a57660814aa83848ff1b2f86fecd6 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 17 Apr 2025 08:09:06 +0200 Subject: [PATCH 32/67] =?UTF-8?q?=F0=9F=90=9B=20Makefile:=20Make=20phony?= =?UTF-8?q?=20targets=20.PHONY?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2a4b13c..12f342a 100644 --- a/Makefile +++ b/Makefile @@ -30,4 +30,4 @@ docker: --build-arg 
BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ -t $(DOCKER_TAG) . -.PHONY: help install docker +.PHONY: help install install-dev test docker From d974369e13e3bf5f20e24084a27b912430717150 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 17 Apr 2025 08:10:56 +0200 Subject: [PATCH 33/67] =?UTF-8?q?=F0=9F=90=9B=20Docker:=20Fix=20descriptio?= =?UTF-8?q?n?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 75dfcdd..f942d78 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ LABEL \ org.label-schema.build-date=$BUILD_DATE \ org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ org.opencontainers.image.title="dinglehopper" \ - org.opencontainers.image.description="The OCR evaluation tool" \ + org.opencontainers.image.description="An OCR evaluation tool" \ org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \ org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \ org.opencontainers.image.revision=$VCS_REF \ From 13ab1ae150481b915c856700c6b0348fb4ba6884 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 17 Apr 2025 08:26:36 +0200 Subject: [PATCH 34/67] =?UTF-8?q?=F0=9F=90=9B=20Docker:=20Use=20same=20ven?= =?UTF-8?q?dor=20as=20license=20for=20now?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f942d78..e497d16 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ LABEL \ org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ org.label-schema.build-date=$BUILD_DATE \ - org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ + 
org.opencontainers.image.vendor="qurator" \ org.opencontainers.image.title="dinglehopper" \ org.opencontainers.image.description="An OCR evaluation tool" \ org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \ From ef817cb343a28241ad5acf4ca956551c816450fb Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 17 Apr 2025 08:37:37 +0200 Subject: [PATCH 35/67] =?UTF-8?q?=F0=9F=93=A6=20v0.10.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/ocrd-tool.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json index 00d5d2b..f63392a 100644 --- a/src/dinglehopper/ocrd-tool.json +++ b/src/dinglehopper/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.9.7", + "version": "0.10.0", "git_url": "https://github.com/qurator-spk/dinglehopper", "dockerhub": "ocrd/dinglehopper", "tools": { From f6dfb77f94b69637d8d6ee8153d5ebfa3d6de90f Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 17 Apr 2025 08:51:32 +0200 Subject: [PATCH 36/67] =?UTF-8?q?=F0=9F=90=9B=20pyproject.toml:=20Fix=20de?= =?UTF-8?q?scription?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3c02d33..9dabb41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ authors = [ {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"}, {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"}, ] -description = "The OCR evaluation tool" +description = "An OCR evaluation tool" readme = "README.md" license.file = "LICENSE" requires-python = ">=3.9" From 64444dd419c7f758ee7ebb42db3746ee016fab7a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Apr 2025 16:08:41 +0200 Subject: [PATCH 37/67] opt out of 7f8a8dd5 (uniseg update that requires py39) --- 
requirements.txt | 2 +- src/dinglehopper/word_error_rate.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 123187b..653ec59 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ click jinja2 lxml -uniseg >= 0.9.1 +uniseg >= 0.8.0 numpy colorama MarkupSafe diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py index ec039b3..f2db504 100644 --- a/src/dinglehopper/word_error_rate.py +++ b/src/dinglehopper/word_error_rate.py @@ -21,10 +21,15 @@ def patch_word_break(): https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt """ old_word_break = uniseg.wordbreak.word_break + if hasattr(uniseg.wordbreak, 'Word_Break'): + aletter = uniseg.wordbreak.Word_Break.ALetter + else: + # uniseg<0.9 + aletter = uniseg.wordbreak.WordBreak.ALETTER def new_word_break(c): if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area - return uniseg.wordbreak.Word_Break.ALetter + return aletter else: return old_word_break(c) From ea33602336f063e68002dbd73e03d617c74dc7e2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Apr 2025 16:09:42 +0200 Subject: [PATCH 38/67] CI: reactivate py38 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 277d4ba..387f7a2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,7 +25,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ] runs-on: "ubuntu-latest" From a24623b966911040b951d6763e22d7da1d750b90 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Apr 2025 16:47:13 +0200 Subject: [PATCH 39/67] re-allow py38 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9dabb41..62fae82 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ authors = [ description = "An OCR evaluation tool" readme = "README.md" license.file = "LICENSE" -requires-python = ">=3.9" +requires-python = ">=3.8" keywords = ["qurator", "ocr", "evaluation", "ocr-d"] dynamic = ["version", "dependencies", "optional-dependencies"] From 817e0c95f7537ad3c219118b50d39769b2f353a7 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 22 Apr 2025 10:32:29 +0200 Subject: [PATCH 40/67] =?UTF-8?q?=F0=9F=93=A6=20v0.10.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/ocrd-tool.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json index f63392a..43795e1 100644 --- a/src/dinglehopper/ocrd-tool.json +++ b/src/dinglehopper/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.10.0", + "version": "0.10.1", "git_url": "https://github.com/qurator-spk/dinglehopper", "dockerhub": "ocrd/dinglehopper", "tools": { From 6bf5bd71780f78f3cef4468628886922c2dbe3c3 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 22 Apr 2025 11:48:44 +0200 Subject: [PATCH 41/67] =?UTF-8?q?=F0=9F=90=9B=20Fix=20vendor=20strings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- LICENSE | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index e497d16..c9b5523 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ LABEL \ org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ org.label-schema.build-date=$BUILD_DATE \ - org.opencontainers.image.vendor="qurator" \ + org.opencontainers.image.vendor="Staatsbibliothek zu Berlin — SPK" \ org.opencontainers.image.title="dinglehopper" \ org.opencontainers.image.description="An OCR evaluation tool" \ org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \ 
diff --git a/LICENSE b/LICENSE index 9b7a833..221c706 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2019 qurator + Copyright 2019-2025 Staatsbibliothek zu Berlin — SPK Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From d8403421fcf583f3941776659651e2f41663a4ef Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 22 Apr 2025 12:30:47 +0200 Subject: [PATCH 42/67] =?UTF-8?q?=E2=9A=99=20=20pre-commit:=20update?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 403658e..c7e6782 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,7 +16,7 @@ repos: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.5 + rev: v0.11.6 hooks: - args: - --fix From 4024e350f7f5379bfffe81d45ba31bf376a4f4db Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 12:32:07 +0100 Subject: [PATCH 43/67] =?UTF-8?q?=F0=9F=9A=A7=20Test=20new=20flexible=20li?= =?UTF-8?q?ne=20dirs=20functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/line_dirs_test.py | 148 ++++++++++++++++++ .../line_dirs_test/basic/gt/a.gt.txt | 1 + .../line_dirs_test/basic/gt/b.gt.txt | 1 + .../line_dirs_test/basic/ocr/a.some-ocr.txt | 1 + .../line_dirs_test/basic/ocr/b.some-ocr.txt | 1 + .../line_dirs_test/merged/a/a.dummy.jpg | 0 .../line_dirs_test/merged/a/a.gt.txt | 1 + .../line_dirs_test/merged/a/a.some-ocr.txt | 1 + .../line_dirs_test/merged/b/b.dummy.jpg | 0 .../line_dirs_test/merged/b/b.gt.txt | 1 + .../line_dirs_test/merged/b/b.some-ocr.txt | 1 + .../line_dirs_test/subdirs/gt/a/a.gt.txt | 1 + 
.../line_dirs_test/subdirs/gt/b/b.gt.txt | 1 + .../subdirs/ocr/a/a.some-ocr.txt | 1 + .../subdirs/ocr/b/b.some-ocr.txt | 1 + 15 files changed, 160 insertions(+) create mode 100644 src/dinglehopper/line_dirs_test.py create mode 100644 src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt create mode 100644 src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt create mode 100644 src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt create mode 100644 src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt create mode 100644 src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg create mode 100644 src/dinglehopper/line_dirs_test/merged/a/a.gt.txt create mode 100644 src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt create mode 100644 src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg create mode 100644 src/dinglehopper/line_dirs_test/merged/b/b.gt.txt create mode 100644 src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt create mode 100644 src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt create mode 100644 src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt create mode 100644 src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt create mode 100644 src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test.py b/src/dinglehopper/line_dirs_test.py new file mode 100644 index 0000000..676fe22 --- /dev/null +++ b/src/dinglehopper/line_dirs_test.py @@ -0,0 +1,148 @@ +import os.path +import itertools +from typing import Iterator, Tuple + +def is_hidden(filepath): + filename = os.path.basename(os.path.abspath(filepath)) + return filename.startswith(".") + +def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]: + """ + Find all files in dir_, returning filenames + + If pred is given, pred(filename) must be True for the filename. + + Does not return hidden files by default. 
+ """ + for root, _, filenames in os.walk(dir_): + for fn in filenames: + if not return_hidden and is_hidden(fn): + continue + if pred and not pred(fn): + continue + yield os.path.join(root, fn) + + +def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]: + """ + Find GT files and matching OCR files. + + Returns pairs of GT and OCR files. + """ + for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)): + ocr_fn = os.path.join( + ocr_dir, + os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix) + + ocr_suffix, + ) + if not os.path.exists(ocr_fn): + raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist") + + yield gt_fn, ocr_fn + +def all_equal(iterable): + g = itertools.groupby(iterable) + return next(g, True) and not next(g, False) + +def common_prefix(its): + return [p[0] for p in itertools.takewhile(all_equal, zip(*its))] + + +def common_suffix(its): + return reversed(common_prefix(reversed(it) for it in its)) + + +def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): + """ + Find GT files and matching OCR files, autodetect suffixes. + + This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR) + files with a common suffix. Currently the files must have a suffix, e.g. + ".gt.txt" (e.g. ".ocr.txt"). + + Returns pairs of GT and OCR files. 
+ """ + + # Autodetect suffixes + gt_files = find_all_files(gt_dir) + gt_suffix = "".join(common_suffix(gt_files)) + if len(gt_suffix) == 0: + raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix") + ocr_files = find_all_files(ocr_dir) + ocr_suffix = "".join(common_suffix(ocr_files)) + if len(ocr_suffix) == 0: + raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix") + + yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) + + +def test_basic(): + """Test the dumb method: User gives directories and suffixes.""" + pairs = list( + find_gt_and_ocr_files( + "line_dirs_test/basic/gt", + ".gt.txt", + "line_dirs_test/basic/ocr", + ".some-ocr.txt", + ) + ) + + assert len(pairs) == 2 + +def test_basic_autodetect(): + """Test the autodetect method: User gives directories, suffixes are autodetected if possible""" + pairs = list( + find_gt_and_ocr_files_autodetect( + "line_dirs_test/basic/gt", + "line_dirs_test/basic/ocr", + ) + ) + + assert len(pairs) == 2 + + +def test_subdirs(): + """Test the dumb method: Should also work when subdirectories are involved.""" + pairs = list( + find_gt_and_ocr_files( + "line_dirs_test/subdirs/gt", + ".gt.txt", + "line_dirs_test/subdirs/ocr", + ".some-ocr.txt", + ) + ) + + assert len(pairs) == 2 + + +def test_subdirs_autodetect(): + """Test the autodetect method: Should also work when subdirectories are involved.""" + pairs = list( + find_gt_and_ocr_files_autodetect( + "line_dirs_test/subdirs/gt", + "line_dirs_test/subdirs/ocr", + ) + ) + + assert len(pairs) == 2 + +def test_merged(): + """Test the dumb method: Should also work when GT and OCR texts are in the same directories.""" + pairs = list( + find_gt_and_ocr_files( + "line_dirs_test/merged", + ".gt.txt", + "line_dirs_test/merged", + ".some-ocr.txt", + ) + ) + + assert len(pairs) == 2 + +if __name__ == "__main__": + test_basic() + test_subdirs() + test_merged() + + test_basic_autodetect() + 
test_subdirs_autodetect() diff --git a/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt b/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt new file mode 100644 index 0000000..484ba93 --- /dev/null +++ b/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt @@ -0,0 +1 @@ +This is a test. diff --git a/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt b/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt new file mode 100644 index 0000000..fc9bd6a --- /dev/null +++ b/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt @@ -0,0 +1 @@ +Another test. diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt b/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt new file mode 100644 index 0000000..27cf4bf --- /dev/null +++ b/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt @@ -0,0 +1 @@ +Tis is a test. diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt b/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt new file mode 100644 index 0000000..0bc0e40 --- /dev/null +++ b/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt @@ -0,0 +1 @@ +AnÖther test. diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg b/src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg new file mode 100644 index 0000000..e69de29 diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt b/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt new file mode 100644 index 0000000..484ba93 --- /dev/null +++ b/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt @@ -0,0 +1 @@ +This is a test. diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt b/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt new file mode 100644 index 0000000..27cf4bf --- /dev/null +++ b/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt @@ -0,0 +1 @@ +Tis is a test. 
diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg b/src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg new file mode 100644 index 0000000..e69de29 diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt b/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt new file mode 100644 index 0000000..fc9bd6a --- /dev/null +++ b/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt @@ -0,0 +1 @@ +Another test. diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt b/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt new file mode 100644 index 0000000..0bc0e40 --- /dev/null +++ b/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt @@ -0,0 +1 @@ +AnÖther test. diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt b/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt new file mode 100644 index 0000000..484ba93 --- /dev/null +++ b/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt @@ -0,0 +1 @@ +This is a test. diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt b/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt new file mode 100644 index 0000000..fc9bd6a --- /dev/null +++ b/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt @@ -0,0 +1 @@ +Another test. diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt b/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt new file mode 100644 index 0000000..27cf4bf --- /dev/null +++ b/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt @@ -0,0 +1 @@ +Tis is a test. diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt b/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt new file mode 100644 index 0000000..0bc0e40 --- /dev/null +++ b/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt @@ -0,0 +1 @@ +AnÖther test. 
From ad8e6de36bf376a830af29e31cefa43066e5baff Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 12:34:08 +0100 Subject: [PATCH 44/67] =?UTF-8?q?=F0=9F=90=9B=20cli=5Fline=5Fdirs:=20Fix?= =?UTF-8?q?=20character=20diff=20reports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 03bf374..01fd585 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -81,7 +81,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): joiner="", none="·", score_hint=score_hint(l_cer, l_n_characters), - ) + )[0] word_diff_report += gen_diff_report( gt_words, ocr_words, @@ -89,7 +89,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): joiner=" ", none="⋯", score_hint=score_hint(l_wer, l_n_words), - ) + )[0] env = Environment( loader=FileSystemLoader( From 2bf2529c380f028e59953584aa2aa26dc3a828b5 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 12:50:14 +0100 Subject: [PATCH 45/67] =?UTF-8?q?=F0=9F=9A=A7=20Port=20new=20line=20dir=20?= =?UTF-8?q?functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 83 +++++++++++++++++++++++++----- src/dinglehopper/line_dirs_test.py | 71 ------------------------- 2 files changed, 69 insertions(+), 85 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 01fd585..43e4f1a 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -1,5 +1,6 @@ import itertools import os +from typing import Iterator, Tuple import click from jinja2 import Environment, FileSystemLoader @@ -12,11 +13,36 @@ from .ocr_files import plain_extract from .word_error_rate import word_error_rate_n, 
words_normalized +def removesuffix(text, suffix): + if suffix and text.endswith(suffix): + return text[: -len(suffix)] + return text + +def is_hidden(filepath): + filename = os.path.basename(os.path.abspath(filepath)) + return filename.startswith(".") + +def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]: + """ + Find all files in dir_, returning filenames + + If pred is given, pred(filename) must be True for the filename. + + Does not return hidden files by default. + """ + for root, _, filenames in os.walk(dir_): + for fn in filenames: + if not return_hidden and is_hidden(fn): + continue + if pred and not pred(fn): + continue + yield os.path.join(root, fn) + + def all_equal(iterable): g = itertools.groupby(iterable) return next(g, True) and not next(g, False) - def common_prefix(its): return [p[0] for p in itertools.takewhile(all_equal, zip(*its))] @@ -24,16 +50,49 @@ def common_prefix(its): def common_suffix(its): return reversed(common_prefix(reversed(it) for it in its)) +def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]: + """ + Find GT files and matching OCR files. -def removesuffix(text, suffix): - if suffix and text.endswith(suffix): - return text[: -len(suffix)] - return text + Returns pairs of GT and OCR files. + """ + for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)): + ocr_fn = os.path.join( + ocr_dir, + os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix) + + ocr_suffix, + ) + if not os.path.exists(ocr_fn): + raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist") + + yield gt_fn, ocr_fn + + +def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): + """ + Find GT files and matching OCR files, autodetect suffixes. + + This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR) + files with a common suffix. Currently the files must have a suffix, e.g. + ".gt.txt" (e.g. ".ocr.txt"). + + Returns pairs of GT and OCR files. 
+ """ + + # Autodetect suffixes + gt_files = find_all_files(gt_dir) + gt_suffix = "".join(common_suffix(gt_files)) + if len(gt_suffix) == 0: + raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix") + ocr_files = find_all_files(ocr_dir) + ocr_suffix = "".join(common_suffix(ocr_files)) + if len(ocr_suffix) == 0: + raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix") + + yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): - gt_suffix = "".join(common_suffix(os.listdir(gt_dir))) - ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir))) cer = None n_characters = None @@ -42,14 +101,10 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): n_words = None word_diff_report = "" - for k, gt in enumerate(os.listdir(gt_dir)): - # Find a match by replacing the suffix - ocr = removesuffix(gt, gt_suffix) + ocr_suffix + for k, (gt_fn, ocr_fn) in enumerate(find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)): - gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True) - ocr_text = plain_extract( - os.path.join(ocr_dir, ocr), include_filename_in_id=True - ) + gt_text = plain_extract(gt_fn, include_filename_in_id=True) + ocr_text = plain_extract(ocr_fn, include_filename_in_id=True) gt_words = words_normalized(gt_text) ocr_words = words_normalized(ocr_text) diff --git a/src/dinglehopper/line_dirs_test.py b/src/dinglehopper/line_dirs_test.py index 676fe22..9827f01 100644 --- a/src/dinglehopper/line_dirs_test.py +++ b/src/dinglehopper/line_dirs_test.py @@ -2,78 +2,7 @@ import os.path import itertools from typing import Iterator, Tuple -def is_hidden(filepath): - filename = os.path.basename(os.path.abspath(filepath)) - return filename.startswith(".") -def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]: - """ - Find all files in dir_, returning filenames - - If pred is given, pred(filename) 
must be True for the filename. - - Does not return hidden files by default. - """ - for root, _, filenames in os.walk(dir_): - for fn in filenames: - if not return_hidden and is_hidden(fn): - continue - if pred and not pred(fn): - continue - yield os.path.join(root, fn) - - -def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]: - """ - Find GT files and matching OCR files. - - Returns pairs of GT and OCR files. - """ - for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)): - ocr_fn = os.path.join( - ocr_dir, - os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix) - + ocr_suffix, - ) - if not os.path.exists(ocr_fn): - raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist") - - yield gt_fn, ocr_fn - -def all_equal(iterable): - g = itertools.groupby(iterable) - return next(g, True) and not next(g, False) - -def common_prefix(its): - return [p[0] for p in itertools.takewhile(all_equal, zip(*its))] - - -def common_suffix(its): - return reversed(common_prefix(reversed(it) for it in its)) - - -def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): - """ - Find GT files and matching OCR files, autodetect suffixes. - - This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR) - files with a common suffix. Currently the files must have a suffix, e.g. - ".gt.txt" (e.g. ".ocr.txt"). - - Returns pairs of GT and OCR files. 
- """ - - # Autodetect suffixes - gt_files = find_all_files(gt_dir) - gt_suffix = "".join(common_suffix(gt_files)) - if len(gt_suffix) == 0: - raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix") - ocr_files = find_all_files(ocr_dir) - ocr_suffix = "".join(common_suffix(ocr_files)) - if len(ocr_suffix) == 0: - raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix") - - yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) def test_basic(): From 6980d7a2526380833ffd4d964e1f1b4c58bfed8a Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 13:21:49 +0100 Subject: [PATCH 46/67] =?UTF-8?q?=F0=9F=9A=A7=20Use=20our=20own=20removesu?= =?UTF-8?q?ffix()=20as=20we=20still=20support=20Python=203.8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 43e4f1a..30b2be1 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -14,6 +14,11 @@ from .word_error_rate import word_error_rate_n, words_normalized def removesuffix(text, suffix): + """ + Remove suffix from text. + + Can be replaced with str.removesuffix when we only support Python >= 3.9. 
+ """ if suffix and text.endswith(suffix): return text[: -len(suffix)] return text @@ -59,7 +64,7 @@ def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tu for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)): ocr_fn = os.path.join( ocr_dir, - os.path.relpath(gt_fn, start=gt_dir).removesuffix(gt_suffix) + removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix, ) if not os.path.exists(ocr_fn): From 73ee16fe5181c29a06f7460ed1fb1dadd84d6cc2 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 13:59:55 +0100 Subject: [PATCH 47/67] =?UTF-8?q?=F0=9F=9A=A7=20Support=20'merged'=20GT+OC?= =?UTF-8?q?R=20line=20directories?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 30b2be1..44305d6 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -97,7 +97,7 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): +def process(gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None): cer = None n_characters = None @@ -106,8 +106,12 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): n_words = None word_diff_report = "" - for k, (gt_fn, ocr_fn) in enumerate(find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)): + if gt_suffix is not None and ocr_suffix is not None: + gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) + else: + gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir) + for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files): gt_text = plain_extract(gt_fn, include_filename_in_id=True) ocr_text = 
plain_extract(ocr_fn, include_filename_in_id=True) gt_words = words_normalized(gt_text) @@ -183,17 +187,25 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): @click.option( "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red" ) -def main(gt, ocr, report_prefix, metrics): +@click.option("--gt-suffix", help="Suffix of GT line text files") +@click.option("--ocr-suffix", help="Suffix of OCR line text files") +def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): """ Compare the GT line text directory against the OCR line text directory. This assumes that the GT line text directory contains textfiles with a common suffix like ".gt.txt", and the OCR line text directory contains textfiles with a common suffix like ".some-ocr.txt". The text files also need to be paired, - i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt" - in the OCT lines directory. + i.e. the GT filename "line001.gt.txt" needs to match a filename + "line001.some-ocr.txt" in the OCR lines directory. - The GT and OCR directories are usually round truth line texts and the results of + GT and OCR directories may contain line text files in matching subdirectories, + e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt". + + GT and OCR directories can also be the same directory, but in this case you need + to give --gt-suffix and --ocr-suffix explicitly. + + The GT and OCR directories are usually ground truth line texts and the results of an OCR software, but you may use dinglehopper to compare two OCR results. In that case, use --no-metrics to disable the then meaningless metrics and also change the color scheme from green/red to blue. 
@@ -204,7 +216,7 @@ def main(gt, ocr, report_prefix, metrics): """ initLogging() - process(gt, ocr, report_prefix, metrics=metrics) + process(gt, ocr, report_prefix, metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix) if __name__ == "__main__": From 68344e48f870968a92c6c51afb759c1fa47dea2b Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 14:49:40 +0100 Subject: [PATCH 48/67] =?UTF-8?q?=F0=9F=8E=A8=20Reformat=20cli=5Fline=5Fdi?= =?UTF-8?q?rs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 34 +++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 44305d6..9e806a1 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -23,11 +23,13 @@ def removesuffix(text, suffix): return text[: -len(suffix)] return text + def is_hidden(filepath): filename = os.path.basename(os.path.abspath(filepath)) return filename.startswith(".") -def find_all_files(dir_: str, pred=None, return_hidden=False) -> Iterator[str]: + +def find_all_files(dir_: str, pred: Callable[[str], bool]=None, return_hidden: bool=False) -> Iterator[str]: """ Find all files in dir_, returning filenames @@ -48,6 +50,7 @@ def all_equal(iterable): g = itertools.groupby(iterable) return next(g, True) and not next(g, False) + def common_prefix(its): return [p[0] for p in itertools.takewhile(all_equal, zip(*its))] @@ -55,7 +58,10 @@ def common_prefix(its): def common_suffix(its): return reversed(common_prefix(reversed(it) for it in its)) -def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tuple[str, str]]: + +def find_gt_and_ocr_files( + gt_dir, gt_suffix, ocr_dir, ocr_suffix +) -> Iterator[Tuple[str, str]]: """ Find GT files and matching OCR files. 
@@ -64,8 +70,7 @@ def find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -> Iterator[Tu for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)): ocr_fn = os.path.join( ocr_dir, - removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) - + ocr_suffix, + removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix, ) if not os.path.exists(ocr_fn): raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist") @@ -88,16 +93,22 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): gt_files = find_all_files(gt_dir) gt_suffix = "".join(common_suffix(gt_files)) if len(gt_suffix) == 0: - raise RuntimeError(f"Files in GT directory {gt_dir} do not have a common suffix") + raise RuntimeError( + f"Files in GT directory {gt_dir} do not have a common suffix" + ) ocr_files = find_all_files(ocr_dir) ocr_suffix = "".join(common_suffix(ocr_files)) if len(ocr_suffix) == 0: - raise RuntimeError(f"Files in OCR directory {ocr_dir} do not have a common suffix") + raise RuntimeError( + f"Files in OCR directory {ocr_dir} do not have a common suffix" + ) yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) -def process(gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None): +def process( + gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None +): cer = None n_characters = None @@ -216,7 +227,14 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): """ initLogging() - process(gt, ocr, report_prefix, metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix) + process( + gt, + ocr, + report_prefix, + metrics=metrics, + gt_suffix=gt_suffix, + ocr_suffix=ocr_suffix, + ) if __name__ == "__main__": From 9414a92f9f31760a694c44f06069f7677e679078 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 15:19:37 +0100 Subject: [PATCH 49/67] =?UTF-8?q?=F0=9F=90=9B=20cli=5Fline=5Fdirs:=20Type-?= =?UTF-8?q?annotate=20functions?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 9e806a1..2cd4fe6 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -1,6 +1,6 @@ import itertools import os -from typing import Iterator, Tuple +from typing import Callable, Iterator, Optional, Tuple import click from jinja2 import Environment, FileSystemLoader @@ -29,7 +29,9 @@ def is_hidden(filepath): return filename.startswith(".") -def find_all_files(dir_: str, pred: Callable[[str], bool]=None, return_hidden: bool=False) -> Iterator[str]: +def find_all_files( + dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False +) -> Iterator[str]: """ Find all files in dir_, returning filenames @@ -60,7 +62,7 @@ def common_suffix(its): def find_gt_and_ocr_files( - gt_dir, gt_suffix, ocr_dir, ocr_suffix + gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str ) -> Iterator[Tuple[str, str]]: """ Find GT files and matching OCR files. From c37316da097d18b74f0da2398b53b64ab712495f Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 12 Dec 2024 19:57:12 +0100 Subject: [PATCH 50/67] =?UTF-8?q?=F0=9F=90=9B=20cli=5Fline=5Fdirs:=20Fix?= =?UTF-8?q?=20word=20differences=20section?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the time of generation of the section, the {gt,ocr}_words generators were drained. Fix by using a list. Fixes gh-124. 
--- src/dinglehopper/cli_line_dirs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 2cd4fe6..2861d6f 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -1,6 +1,6 @@ import itertools import os -from typing import Callable, Iterator, Optional, Tuple +from typing import Callable, Iterator, Optional, Tuple, List import click from jinja2 import Environment, FileSystemLoader @@ -127,8 +127,8 @@ def process( for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files): gt_text = plain_extract(gt_fn, include_filename_in_id=True) ocr_text = plain_extract(ocr_fn, include_filename_in_id=True) - gt_words = words_normalized(gt_text) - ocr_words = words_normalized(ocr_text) + gt_words: List[str] = list(words_normalized(gt_text)) + ocr_words: List[str] = list(words_normalized(ocr_text)) # Compute CER l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text) From 322faeb26c2c60d8d777ab6132b9af397d0fd510 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 14 Dec 2024 09:21:09 +0100 Subject: [PATCH 51/67] =?UTF-8?q?=F0=9F=8E=A8=20Sort=20imports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 2861d6f..5cd1bfa 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -1,6 +1,6 @@ import itertools import os -from typing import Callable, Iterator, Optional, Tuple, List +from typing import Callable, Iterator, List, Optional, Tuple import click from jinja2 import Environment, FileSystemLoader From 3b16c14c16dd00500574b74031107768d5cbb465 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 14 Dec 2024 09:50:24 +0100 Subject: [PATCH 52/67] =?UTF-8?q?=E2=9C=94=20=20Properly=20test=20line=20d?= 
=?UTF-8?q?ir=20finding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + .../data/line_dirs}/basic/gt/a.gt.txt | 0 .../data/line_dirs}/basic/gt/b.gt.txt | 0 .../data/line_dirs}/basic/ocr/a.some-ocr.txt | 0 .../data/line_dirs}/basic/ocr/b.some-ocr.txt | 0 .../data/line_dirs}/merged/a/a.dummy.jpg | 0 .../data/line_dirs}/merged/a/a.gt.txt | 0 .../data/line_dirs}/merged/a/a.some-ocr.txt | 0 .../data/line_dirs}/merged/b/b.dummy.jpg | 0 .../data/line_dirs}/merged/b/b.gt.txt | 0 .../data/line_dirs}/merged/b/b.some-ocr.txt | 0 .../data/line_dirs}/subdirs/gt/a/a.gt.txt | 0 .../data/line_dirs}/subdirs/gt/b/b.gt.txt | 0 .../line_dirs}/subdirs/ocr/a/a.some-ocr.txt | 0 .../line_dirs}/subdirs/ocr/b/b.some-ocr.txt | 0 .../test_line_dirs.py} | 40 ++++++++----------- 16 files changed, 18 insertions(+), 23 deletions(-) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/gt/a.gt.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/gt/b.gt.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/ocr/a.some-ocr.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/basic/ocr/b.some-ocr.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/a/a.dummy.jpg (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/a/a.gt.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/a/a.some-ocr.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/b/b.dummy.jpg (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/b/b.gt.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/merged/b/b.some-ocr.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/gt/a/a.gt.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/gt/b/b.gt.txt (100%) 
rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/ocr/a/a.some-ocr.txt (100%) rename src/dinglehopper/{line_dirs_test => tests/data/line_dirs}/subdirs/ocr/b/b.some-ocr.txt (100%) rename src/dinglehopper/{line_dirs_test.py => tests/test_line_dirs.py} (52%) diff --git a/.gitignore b/.gitignore index d931831..66d66bc 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ dmypy.json # User-specific stuff .idea +.*.swp # Build artifacts /build diff --git a/src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt b/src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/basic/gt/a.gt.txt rename to src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt diff --git a/src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/basic/gt/b.gt.txt rename to src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/basic/ocr/a.some-ocr.txt rename to src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/basic/ocr/b.some-ocr.txt rename to src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg b/src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg similarity index 100% rename from src/dinglehopper/line_dirs_test/merged/a/a.dummy.jpg rename to src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.gt.txt 
b/src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/merged/a/a.gt.txt rename to src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt diff --git a/src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/merged/a/a.some-ocr.txt rename to src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg b/src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg similarity index 100% rename from src/dinglehopper/line_dirs_test/merged/b/b.dummy.jpg rename to src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/merged/b/b.gt.txt rename to src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt diff --git a/src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/merged/b/b.some-ocr.txt rename to src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/subdirs/gt/a/a.gt.txt rename to src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt diff --git a/src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/subdirs/gt/b/b.gt.txt rename to src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt 
b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/subdirs/ocr/a/a.some-ocr.txt rename to src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt similarity index 100% rename from src/dinglehopper/line_dirs_test/subdirs/ocr/b/b.some-ocr.txt rename to src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt diff --git a/src/dinglehopper/line_dirs_test.py b/src/dinglehopper/tests/test_line_dirs.py similarity index 52% rename from src/dinglehopper/line_dirs_test.py rename to src/dinglehopper/tests/test_line_dirs.py index 9827f01..03966e1 100644 --- a/src/dinglehopper/line_dirs_test.py +++ b/src/dinglehopper/tests/test_line_dirs.py @@ -1,29 +1,30 @@ -import os.path -import itertools -from typing import Iterator, Tuple +import os +from ..cli_line_dirs import find_gt_and_ocr_files, find_gt_and_ocr_files_autodetect +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") def test_basic(): """Test the dumb method: User gives directories and suffixes.""" pairs = list( find_gt_and_ocr_files( - "line_dirs_test/basic/gt", + os.path.join(data_dir, "line_dirs/basic/gt"), ".gt.txt", - "line_dirs_test/basic/ocr", + os.path.join(data_dir, "line_dirs/basic/ocr"), ".some-ocr.txt", ) ) assert len(pairs) == 2 + def test_basic_autodetect(): - """Test the autodetect method: User gives directories, suffixes are autodetected if possible""" + """Test autodetect: User gives directories, suffixes are autodetected if possible""" pairs = list( find_gt_and_ocr_files_autodetect( - "line_dirs_test/basic/gt", - "line_dirs_test/basic/ocr", + os.path.join(data_dir, "line_dirs/basic/gt"), + os.path.join(data_dir, "line_dirs/basic/ocr"), ) ) @@ -34,9 +35,9 @@ def test_subdirs(): """Test the dumb method: Should also work when subdirectories 
are involved.""" pairs = list( find_gt_and_ocr_files( - "line_dirs_test/subdirs/gt", + os.path.join(data_dir, "line_dirs/subdirs/gt"), ".gt.txt", - "line_dirs_test/subdirs/ocr", + os.path.join(data_dir, "line_dirs/subdirs/ocr"), ".some-ocr.txt", ) ) @@ -48,30 +49,23 @@ def test_subdirs_autodetect(): """Test the autodetect method: Should also work when subdirectories are involved.""" pairs = list( find_gt_and_ocr_files_autodetect( - "line_dirs_test/subdirs/gt", - "line_dirs_test/subdirs/ocr", + os.path.join(data_dir, "line_dirs/subdirs/gt"), + os.path.join(data_dir, "line_dirs/subdirs/ocr"), ) ) assert len(pairs) == 2 + def test_merged(): - """Test the dumb method: Should also work when GT and OCR texts are in the same directories.""" + """Test the dumb method: GT and OCR texts are in the same directories.""" pairs = list( find_gt_and_ocr_files( - "line_dirs_test/merged", + os.path.join(data_dir, "line_dirs/merged"), ".gt.txt", - "line_dirs_test/merged", + os.path.join(data_dir, "line_dirs/merged"), ".some-ocr.txt", ) ) assert len(pairs) == 2 - -if __name__ == "__main__": - test_basic() - test_subdirs() - test_merged() - - test_basic_autodetect() - test_subdirs_autodetect() From f1a586cff1d306d3fbef95c8110af74d3941a894 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 14 Dec 2024 10:36:58 +0100 Subject: [PATCH 53/67] =?UTF-8?q?=E2=9C=94=20=20Test=20line=20dirs=20CLI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_integ_cli_line_dirs.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 src/dinglehopper/tests/test_integ_cli_line_dirs.py diff --git a/src/dinglehopper/tests/test_integ_cli_line_dirs.py b/src/dinglehopper/tests/test_integ_cli_line_dirs.py new file mode 100644 index 0000000..90cbabf --- /dev/null +++ b/src/dinglehopper/tests/test_integ_cli_line_dirs.py @@ -0,0 +1,61 @@ +import json +import os.path +import re + +import pytest + +from ..cli_line_dirs import 
process +from .util import working_directory + +data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") + + +@pytest.mark.integration +def test_cli_line_dirs_basic(tmp_path): + """Test that the cli/process() produces a good report""" + + with working_directory(tmp_path): + gt_dir = os.path.join(data_dir, "line_dirs/basic/gt") + ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr") + process(gt_dir, ocr_dir, "report") + with open("report.json", "r") as jsonf: + print(jsonf.read()) + with open("report.json", "r") as jsonf: + j = json.load(jsonf) + assert j["cer"] == pytest.approx(0.1071429) + assert j["wer"] == pytest.approx(0.5) + + +@pytest.mark.integration +def test_cli_line_dirs_basic_report_diff(tmp_path): + """Test that the cli/process() produces a report wiff char+word diff""" + + with working_directory(tmp_path): + gt_dir = os.path.join(data_dir, "line_dirs/basic/gt") + ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr") + process(gt_dir, ocr_dir, "report") + + with open("report.html", "r") as htmlf: + html_report = htmlf.read() + + # Counting GT lines in the diff + assert len(re.findall(r"gt.*l\d+-cdiff", html_report)) == 2 + assert len(re.findall(r"gt.*l\d+-wdiff", html_report)) == 2 + + +@pytest.mark.integration +def test_cli_line_dirs_merged(tmp_path): + """Test that the cli/process() produces a good report""" + + with working_directory(tmp_path): + gt_dir = os.path.join(data_dir, "line_dirs/merged") + ocr_dir = os.path.join(data_dir, "line_dirs/merged") + process( + gt_dir, ocr_dir, "report", gt_suffix=".gt.txt", ocr_suffix=".some-ocr.txt" + ) + with open("report.json", "r") as jsonf: + print(jsonf.read()) + with open("report.json", "r") as jsonf: + j = json.load(jsonf) + assert j["cer"] == pytest.approx(0.1071429) + assert j["wer"] == pytest.approx(0.5) From 480b3cf864ba1ba5c26ed550760b53193b91e93d Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Sat, 14 Dec 2024 11:14:07 +0100 Subject: [PATCH 54/67] 
=?UTF-8?q?=E2=9C=94=20=20Test=20that=20CLI=20produc?= =?UTF-8?q?es=20a=20complete=20HTML=20report?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...json.py => test_integ_cli_valid_report.py} | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) rename src/dinglehopper/tests/{test_integ_cli_valid_json.py => test_integ_cli_valid_report.py} (64%) diff --git a/src/dinglehopper/tests/test_integ_cli_valid_json.py b/src/dinglehopper/tests/test_integ_cli_valid_report.py similarity index 64% rename from src/dinglehopper/tests/test_integ_cli_valid_json.py rename to src/dinglehopper/tests/test_integ_cli_valid_report.py index 6cbfa0c..fed0d28 100644 --- a/src/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/src/dinglehopper/tests/test_integ_cli_valid_report.py @@ -1,4 +1,5 @@ import json +import re import pytest @@ -40,3 +41,25 @@ def test_cli_json_cer_is_infinity(tmp_path): with open("report.json", "r") as jsonf: j = json.load(jsonf) assert j["cer"] == pytest.approx(float("inf")) + + +@pytest.mark.integration +def test_cli_html(tmp_path): + """Test that the cli/process() yields complete HTML report""" + + with working_directory(tmp_path): + with open("gt.txt", "w") as gtf: + gtf.write("AAAAA") + with open("ocr.txt", "w") as ocrf: + ocrf.write("AAAAB") + + process("gt.txt", "ocr.txt", "report") + + with open("report.html", "r") as htmlf: + html_report = htmlf.read() + print(html_report) + + assert re.search(r"CER: 0\.\d+", html_report) + assert re.search(r"WER: 1\.0", html_report) + assert len(re.findall("gt.*cdiff", html_report)) == 1 + assert len(re.findall("gt.*wdiff", html_report)) == 1 From cf59b951a3a30cd23e36a0bb2e553f2d6abcee20 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Tue, 4 Feb 2025 13:54:28 +0100 Subject: [PATCH 55/67] =?UTF-8?q?=F0=9F=9A=A7=20Add=20option=20for=20text?= =?UTF-8?q?=20encoding=20to=20line=20dir=20cli?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_line_dirs.py | 27 +++++++++++++++++++++++---- src/dinglehopper/ocr_files.py | 22 ++++++++++++++++------ 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 5cd1bfa..4064de0 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -109,7 +109,14 @@ def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): def process( - gt_dir, ocr_dir, report_prefix, *, metrics=True, gt_suffix=None, ocr_suffix=None + gt_dir, + ocr_dir, + report_prefix, + *, + metrics=True, + gt_suffix=None, + ocr_suffix=None, + plain_encoding="autodetect", ): cer = None @@ -125,8 +132,12 @@ def process( gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir) for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files): - gt_text = plain_extract(gt_fn, include_filename_in_id=True) - ocr_text = plain_extract(ocr_fn, include_filename_in_id=True) + gt_text = plain_extract( + gt_fn, include_filename_in_id=True, encoding=plain_encoding + ) + ocr_text = plain_extract( + ocr_fn, include_filename_in_id=True, encoding=plain_encoding + ) gt_words: List[str] = list(words_normalized(gt_text)) ocr_words: List[str] = list(words_normalized(ocr_text)) @@ -202,7 +213,12 @@ def process( ) @click.option("--gt-suffix", help="Suffix of GT line text files") @click.option("--ocr-suffix", help="Suffix of OCR line text files") -def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): +@click.option( + "--plain-encoding", + default="autodetect", + help='Encoding (e.g. "utf-8") of plain text files', +) +def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding): """ Compare the GT line text directory against the OCR line text directory. @@ -227,6 +243,8 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): $REPORT_PREFIX defaults to "report". 
The reports include the character error rate (CER) and the word error rate (WER). + It is recommended to specify the encoding of the text files, for example with + --plain-encoding utf-8. If this option is not given, we try to auto-detect it. """ initLogging() process( @@ -236,6 +254,7 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix): metrics=metrics, gt_suffix=gt_suffix, ocr_suffix=ocr_suffix, + plain_encoding=plain_encoding, ) diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py index 1593f44..1eecebb 100644 --- a/src/dinglehopper/ocr_files.py +++ b/src/dinglehopper/ocr_files.py @@ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional import chardet from lxml import etree as ET from lxml.etree import XMLSyntaxError +from ocrd_utils import getLogger from uniseg.graphemecluster import grapheme_clusters from .extracted_text import ExtractedText, normalize_sbb +log = getLogger("processor.OcrdDinglehopperEvaluate") + def alto_namespace(tree: ET._ElementTree) -> Optional[str]: """Return the ALTO namespace used in the given ElementTree. 
@@ -149,7 +152,7 @@ def detect_encoding(filename): return chardet.detect(open(filename, "rb").read(1024))["encoding"] -def plain_extract(filename, include_filename_in_id=False): +def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"): id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}" def make_segment(no, line): @@ -163,7 +166,14 @@ def plain_extract(filename, include_filename_in_id=False): clusters, ) - fileencoding = detect_encoding(filename) + if encoding == "autodetect": + fileencoding = detect_encoding(filename) + log.warn( + f"Autodetected encoding as '{fileencoding}'" + ", it is recommended to specify it explicitly with --plain-encoding" + ) + else: + fileencoding = encoding with open(filename, "r", encoding=fileencoding) as f: return ExtractedText( None, @@ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False): # XXX hardcoded SBB normalization -def plain_text(filename): - return plain_extract(filename).text +def plain_text(filename, encoding="autodetect"): + return plain_extract(filename, encoding=encoding).text -def extract(filename, *, textequiv_level="region"): +def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"): """Extract the text from the given file. Supports PAGE, ALTO and falls back to plain text. 
@@ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"): try: tree = ET.parse(filename) except (XMLSyntaxError, UnicodeDecodeError): - return plain_extract(filename) + return plain_extract(filename, encoding=plain_encoding) try: return page_extract(tree, textequiv_level=textequiv_level) except ValueError: From 5578ce83a3600bbe6f6a0a2679f2b35c90b34fe4 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 13 Feb 2025 16:39:29 +0100 Subject: [PATCH 56/67] =?UTF-8?q?=F0=9F=9A=A7=20Add=20option=20for=20text?= =?UTF-8?q?=20encoding=20to=20line=20dir=20cli?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py index b67e9cc..5e5e81c 100644 --- a/src/dinglehopper/cli.py +++ b/src/dinglehopper/cli.py @@ -114,6 +114,7 @@ def process( metrics: bool = True, differences: bool = False, textequiv_level: str = "region", + plain_encoding: str = "autodetect", ) -> None: """Check OCR result against GT. @@ -121,8 +122,12 @@ def process( this undecorated version and use Click on a wrapper. 
""" - gt_text = extract(gt, textequiv_level=textequiv_level) - ocr_text = extract(ocr, textequiv_level=textequiv_level) + gt_text = extract( + gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding + ) + ocr_text = extract( + ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding + ) gt_words: List[str] = list(words_normalized(gt_text)) ocr_words: List[str] = list(words_normalized(ocr_text)) @@ -195,6 +200,7 @@ def process_dir( metrics: bool = True, differences: bool = False, textequiv_level: str = "region", + plain_encoding: str = "autodetect", ) -> None: for gt_file in os.listdir(gt): gt_file_path = os.path.join(gt, gt_file) @@ -209,6 +215,7 @@ def process_dir( metrics=metrics, differences=differences, textequiv_level=textequiv_level, + plain_encoding=plain_encoding, ) else: print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path)) @@ -233,6 +240,11 @@ def process_dir( help="PAGE TextEquiv level to extract text from", metavar="LEVEL", ) +@click.option( + "--plain-encoding", + default="autodetect", + help='Encoding (e.g. 
"utf-8") of plain text files', +) @click.option("--progress", default=False, is_flag=True, help="Show progress bar") @click.version_option() def main( @@ -243,6 +255,7 @@ def main( metrics, differences, textequiv_level, + plain_encoding, progress, ): """ @@ -280,6 +293,7 @@ def main( metrics=metrics, differences=differences, textequiv_level=textequiv_level, + plain_encoding=plain_encoding, ) else: process( @@ -290,6 +304,7 @@ def main( metrics=metrics, differences=differences, textequiv_level=textequiv_level, + plain_encoding=plain_encoding, ) From 9db5b4caf5b6335066e121a231cee1b1298bfbfa Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 13 Feb 2025 16:48:50 +0100 Subject: [PATCH 57/67] =?UTF-8?q?=F0=9F=9A=A7=20Add=20OCR-D=20parameter=20?= =?UTF-8?q?for=20plain=20text=20encoding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/ocrd-tool.json | 5 +++++ src/dinglehopper/ocrd_cli.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json index 43795e1..ae7c9bb 100644 --- a/src/dinglehopper/ocrd-tool.json +++ b/src/dinglehopper/ocrd-tool.json @@ -25,6 +25,11 @@ "enum": ["region", "line"], "default": "region", "description": "PAGE XML hierarchy level to extract the text from" + }, + "plain_encoding": { + "type": "string", + "default": "autodetect", + "description": "Encoding (e.g. 
\"utf-8\") of plain text files" } } } diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index fa4747f..2d7da8e 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -26,6 +26,7 @@ class OcrdDinglehopperEvaluate(Processor): assert self.parameter metrics = self.parameter["metrics"] textequiv_level = self.parameter["textequiv_level"] + plain_encoding = self.parameter["plain_encoding"] # wrong number of inputs: let fail gt_file, ocr_file = input_files @@ -52,6 +53,7 @@ class OcrdDinglehopperEvaluate(Processor): self.output_file_grp, metrics=metrics, textequiv_level=textequiv_level, + plain_encoding=plain_encoding, ) # Add reports to the workspace From 224aa02163b5ba28a4f44569b4cbb04d0dae4188 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Thu, 13 Feb 2025 16:50:21 +0100 Subject: [PATCH 58/67] =?UTF-8?q?=F0=9F=9A=A7=20Fix=20help=20text?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli.py | 2 +- src/dinglehopper/cli_line_dirs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py index 5e5e81c..2d3c075 100644 --- a/src/dinglehopper/cli.py +++ b/src/dinglehopper/cli.py @@ -243,7 +243,7 @@ def process_dir( @click.option( "--plain-encoding", default="autodetect", - help='Encoding (e.g. "utf-8") of plain text files', + help='Encoding (e.g. "utf-8") of plain text files', ) @click.option("--progress", default=False, is_flag=True, help="Show progress bar") @click.version_option() diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 4064de0..0160f87 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -216,7 +216,7 @@ def process( @click.option( "--plain-encoding", default="autodetect", - help='Encoding (e.g. "utf-8") of plain text files', + help='Encoding (e.g. 
"utf-8") of plain text files', ) def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding): """ From a70260c10edbff774fcae1d3f636b2b5e806d4ae Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 22 Apr 2025 13:56:13 +0200 Subject: [PATCH 59/67] =?UTF-8?q?=F0=9F=90=9B=20Use=20warning()=20to=20fix?= =?UTF-8?q?=20DeprecationWarning?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/ocr_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py index 1eecebb..fdcaf54 100644 --- a/src/dinglehopper/ocr_files.py +++ b/src/dinglehopper/ocr_files.py @@ -168,7 +168,7 @@ def plain_extract(filename, include_filename_in_id=False, encoding="autodetect") if encoding == "autodetect": fileencoding = detect_encoding(filename) - log.warn( + log.warning( f"Autodetected encoding as '{fileencoding}'" ", it is recommended to specify it explicitly with --plain-encoding" ) From 14a4bc56d85bd953153bf64bcb95a92413814efb Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Tue, 22 Apr 2025 18:24:35 +0200 Subject: [PATCH 60/67] =?UTF-8?q?=F0=9F=90=9B=20Add=20--plain-encoding=20o?= =?UTF-8?q?ption=20to=20dinglehopper-extract?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/cli_extract.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/dinglehopper/cli_extract.py b/src/dinglehopper/cli_extract.py index 9c51d34..5fce032 100644 --- a/src/dinglehopper/cli_extract.py +++ b/src/dinglehopper/cli_extract.py @@ -12,7 +12,12 @@ from .ocr_files import extract help="PAGE TextEquiv level to extract text from", metavar="LEVEL", ) -def main(input_file, textequiv_level): +@click.option( + "--plain-encoding", + default="autodetect", + help='Encoding (e.g. 
"utf-8") of plain text files', +) +def main(input_file, textequiv_level, plain_encoding): """ Extract the text of the given INPUT_FILE. @@ -23,7 +28,9 @@ def main(input_file, textequiv_level): use "--textequiv-level line" to extract from the level of TextLine tags. """ initLogging() - input_text = extract(input_file, textequiv_level=textequiv_level).text + input_text = extract( + input_file, textequiv_level=textequiv_level, plain_encoding=plain_encoding + ).text print(input_text) From 9fc8937324b8ba2c94ddd865fb8c05fa5f92c49d Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 24 Apr 2025 15:13:19 +0200 Subject: [PATCH 61/67] =?UTF-8?q?=E2=9C=92=20=20README:=20Mention=20dingle?= =?UTF-8?q?hopper-line-dirs=20--help?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 76fcc5a..a40db79 100644 --- a/README.md +++ b/README.md @@ -112,9 +112,13 @@ You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt. with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate CLI interface: -~~~ +``` dinglehopper-line-dirs gt/ ocr/ -~~~ +``` + +The CLI `dinglehopper-line-dirs` can also work with GT text files in the same +directories as the OCR text files. You should read `dinglehopper-line-dirs --help` +in this case. 
### dinglehopper-extract The tool `dinglehopper-extract` extracts the text of the given input file on From 5639f3db7f12647694c4ef03437af00227f45f58 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 24 Apr 2025 16:44:29 +0200 Subject: [PATCH 62/67] =?UTF-8?q?=E2=9C=94=20=20Add=20a=20tests=20that=20c?= =?UTF-8?q?hecks=20if=20plain=20text=20files=20with=20BOM=20are=20read=20c?= =?UTF-8?q?orrectly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/tests/test_ocr_files.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/dinglehopper/tests/test_ocr_files.py b/src/dinglehopper/tests/test_ocr_files.py index 342507a..0c2a500 100644 --- a/src/dinglehopper/tests/test_ocr_files.py +++ b/src/dinglehopper/tests/test_ocr_files.py @@ -182,3 +182,15 @@ def test_plain(tmp_path): result = plain_text("ocr.txt") expected = "First, a line.\nAnd a second line." assert result == expected + + +def test_plain_BOM(tmp_path): + """Test that plain text files with BOM are read correctly.""" + BOM = "\ufeff" + with working_directory(tmp_path): + with open("ocr.txt", "w") as ocrf: + ocrf.write(BOM + "First, a line.\nAnd a second line.\n") + + result = plain_text("ocr.txt") + expected = "First, a line.\nAnd a second line." 
+ assert result == expected From 628594ef98df634f3c411c780a4bccd26bb07526 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 24 Apr 2025 17:14:44 +0200 Subject: [PATCH 63/67] =?UTF-8?q?=F0=9F=93=A6=20v0.11.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dinglehopper/ocrd-tool.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json index 43795e1..6fad45a 100644 --- a/src/dinglehopper/ocrd-tool.json +++ b/src/dinglehopper/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.10.1", + "version": "0.11.0", "git_url": "https://github.com/qurator-spk/dinglehopper", "dockerhub": "ocrd/dinglehopper", "tools": { From 1ebb004386501986562aa0b927f543d9dfa6068c Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Fri, 25 Apr 2025 10:13:06 +0200 Subject: [PATCH 64/67] =?UTF-8?q?=E2=9A=99=20=20pre-commit:=20update?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c7e6782..345060d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,7 +16,7 @@ repos: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.6 + rev: v0.11.7 hooks: - args: - --fix From 774790c36f7e1477d383bfb0f1771dc523953524 Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Fri, 25 Apr 2025 11:20:00 +0200 Subject: [PATCH 65/67] =?UTF-8?q?=E2=9C=94=20=20GitHub=20Actions:=20Make?= =?UTF-8?q?=20reporting=20results=20clearer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the "Actions" tab on GitHub, the workflow run that would post test results to the _original_ workflow run is named "Test Report". This would lead me to click on it to see the results, just to be disappointed. 
This aims to make the naming of the GitHub workflows/jobs clearer. --- .github/workflows/test.yml | 2 +- .github/workflows/test_report.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 387f7a2..db089d0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,4 +1,4 @@ -name: Test +name: 'Test' on: diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml index 26f411b..5579d8c 100644 --- a/.github/workflows/test_report.yml +++ b/.github/workflows/test_report.yml @@ -1,4 +1,4 @@ -name: 'Test Report' +name: 'Test - Report results' on: workflow_run: workflows: ['test'] @@ -15,6 +15,6 @@ jobs: - uses: dorny/test-reporter@v1 with: artifact: /test-results-(.*)/ - name: 'Tests Results - $1' + name: 'test - Results ($1)' path: '*junit.xml' reporter: java-junit From d09e3969f820c425d65e27d5b33baca9b191f9c1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 2 May 2025 00:18:38 +0200 Subject: [PATCH 66/67] docker: prepackage ocrd-all-module-dir.json --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index c9b5523..7064efc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,6 +32,8 @@ COPY . . COPY ocrd-tool.json . 
# prepackage ocrd-tool.json as ocrd-all-tool.json RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +# prepackage ocrd-all-module-dir.json +RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json RUN make install && rm -rf /build/dinglehopper WORKDIR /data From b1ef3af1a8725cd9053941542772b17b66a5cbe5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 2 May 2025 00:18:35 +0200 Subject: [PATCH 67/67] docker: use latest core base stage --- Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 12f342a..3729311 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,9 @@ PIP = pip3 PYTHONIOENCODING=utf8 PYTEST_ARGS = -vv -DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0 -DOCKER_TAG = ocrd/dinglehopper +DOCKER_BASE_IMAGE ?= docker.io/ocrd/core:latest +DOCKER_TAG ?= ocrd/dinglehopper +DOCKER ?= docker help: @echo @@ -24,7 +25,7 @@ test: pytest $(PYTEST_ARGS) docker: - docker build \ + $(DOCKER) build \ --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \