mirror of https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-08 03:10:30 +02:00

commit 139b3a1cc6: Merge ba45129abf into 071e6a8bd1
17 changed files with 115 additions and 111 deletions
.dockerignore (new file, 5 lines)

@@ -0,0 +1,5 @@
+src/dinglehopper/tests
+dist
+build
+*.egg-info
+.git
Dockerfile (36 lines changed)

@@ -6,17 +6,33 @@ LABEL \
     maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
     org.label-schema.vcs-ref=$VCS_REF \
     org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
-    org.label-schema.build-date=$BUILD_DATE
+    org.label-schema.build-date=$BUILD_DATE \
+    org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
+    org.opencontainers.image.title="dinglehopper" \
+    org.opencontainers.image.description="The OCR evaluation tool" \
+    org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
+    org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
+    org.opencontainers.image.revision=$VCS_REF \
+    org.opencontainers.image.created=$BUILD_DATE \
+    org.opencontainers.image.base.name=ocrd/core
+
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+
+# avoid HOME/.local/share (hard to predict USER here)
+# so let XDG_DATA_HOME coincide with fixed system location
+# (can still be overridden by derived stages)
+ENV XDG_DATA_HOME /usr/local/share
+# avoid the need for an extra volume for persistent resource user db
+# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
+ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
 
 WORKDIR /build/dinglehopper
-COPY pyproject.toml .
-COPY src/dinglehopper/ocrd-tool.json .
-COPY src ./src
-COPY requirements.txt .
-COPY README.md .
-COPY Makefile .
-RUN make install
-RUN rm -rf /build/dinglehopper
+COPY . .
+COPY ocrd-tool.json .
+# prepackage ocrd-tool.json as ocrd-all-tool.json
+RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
+RUN make install && rm -rf /build/dinglehopper
 
 WORKDIR /data
-VOLUME ["/data"]
+VOLUME /data
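Note on the new build: the image gains standard OCI labels, UTF-8 locale settings, and pinned XDG directories so that processor resources land in a fixed system location. The dump-tools step prepackages the tool description as ocrd-all-tool.json next to core's bashlib, presumably so that OCR-D core v3 can look up processor metadata without invoking each executable.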
Makefile (9 lines changed)

@@ -1,8 +1,9 @@
 PYTHON = python3
 PIP = pip3
 PYTHONIOENCODING=utf8
+PYTEST_ARGS = -vv
 
-DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0
+DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
 DOCKER_TAG = ocrd/dinglehopper
 
 help:
@@ -16,6 +17,12 @@ help:
 install:
 	$(PIP) install .
 
+install-dev:
+	$(PIP) install -e .
+
+test:
+	pytest $(PYTEST_ARGS)
+
 docker:
 	docker build \
 	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
requirements.txt

@@ -1,14 +1,16 @@
 click
 jinja2
 lxml
-uniseg >= 0.8.0
+# FIXME
+uniseg == 0.8.1
 numpy
 colorama
 MarkupSafe
-ocrd >= 2.65.0
+ocrd >= 3.3.0
 attrs
 multimethod >= 1.3
 tqdm
-rapidfuzz >= 2.7.0
+# FIXME
+rapidfuzz >= 2.7.0, < 3
 chardet
 importlib_resources
src/dinglehopper/character_error_rate.py

@@ -20,14 +20,7 @@ def character_error_rate_n(
     :return: character error rate and length of the reference
     """
-
-    d = distance(reference, compared)
-    n = len(reference)
-
-    if d == 0:
-        return 0, n
-    if n == 0:
-        return float("inf"), n
-    return d / n, n
+    return distance(reference, compared), len(reference)
 
 
 # XXX Should we really count newlines here?
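The special cases removed here (zero distance, empty reference) now fall out of the normalized distance, which is already a rate. A quick sketch of the resulting behavior, assuming the package-level export that the tests below use:

    from dinglehopper import character_error_rate

    # CER is now bounded to [0, 1]: the edit distance is divided by the
    # length of the longer text, no longer by the reference length alone.
    assert character_error_rate("Foo", "Food") == 1 / 4  # was 1 / 3
    assert character_error_rate("", "Foo") == 1.0        # was float("inf")
    assert character_error_rate("", "") == 0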
src/dinglehopper/edit_distance.py

@@ -9,18 +9,18 @@ from .extracted_text import ExtractedText
 
 
 @multimethod
-def distance(seq1: List[str], seq2: List[str]) -> int:
+def distance(seq1: List[str], seq2: List[str]) -> float:
     """Compute the Levenshtein edit distance between two lists of grapheme clusters.
 
     This assumes that the grapheme clusters are already normalized.
 
     Use distance(str, str) instead if you need to compare two Unicode strings.
     """
-    return Levenshtein.distance(seq1, seq2)
+    return Levenshtein.normalized_distance(seq1, seq2)
 
 
 @distance.register
-def _(s1: str, s2: str) -> int:
+def _(s1: str, s2: str) -> float:
     """Compute the Levenshtein edit distance between two Unicode strings
 
     Note that this is different from levenshtein() as this function knows about Unicode
@@ -29,12 +29,12 @@ def _(s1: str, s2: str) -> int:
     """
     seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
     seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
-    return Levenshtein.distance(seq1, seq2)
+    return Levenshtein.normalized_distance(seq1, seq2)
 
 
 @distance.register
-def _(s1: ExtractedText, s2: ExtractedText) -> int:
-    return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters)
+def _(s1: ExtractedText, s2: ExtractedText) -> float:
+    return Levenshtein.normalized_distance(s1.grapheme_clusters, s2.grapheme_clusters)
 
 
 def editops(word1, word2):
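For reference, a sketch of the rapidfuzz semantics this switch relies on (with the rapidfuzz >= 2.7.0, < 3 pin from requirements.txt): normalized_distance divides the edit distance by the length of the longer sequence and treats two empty sequences as distance 0.

    from rapidfuzz.distance import Levenshtein

    assert Levenshtein.distance("Fnord", "Food") == 2
    assert Levenshtein.normalized_distance("Fnord", "Food") == 2 / 5  # 2 / max(5, 4)
    assert Levenshtein.normalized_distance("", "Foo") == 1.0
    assert Levenshtein.normalized_distance("", "") == 0.0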
src/dinglehopper/ocrd-tool.json

@@ -1,17 +1,13 @@
 {
   "version": "0.9.7",
   "git_url": "https://github.com/qurator-spk/dinglehopper",
+  "dockerhub": "ocrd/dinglehopper",
   "tools": {
     "ocrd-dinglehopper": {
       "executable": "ocrd-dinglehopper",
+      "input_file_grp_cardinality": 2,
+      "output_file_grp_cardinality": 1,
       "description": "Evaluate OCR text against ground truth with dinglehopper",
-      "input_file_grp": [
-        "OCR-D-GT-PAGE",
-        "OCR-D-OCR"
-      ],
-      "output_file_grp": [
-        "OCR-D-OCR-EVAL"
-      ],
       "categories": [
         "Quality assurance"
       ],
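The fixed input_file_grp/output_file_grp lists give way to cardinality declarations: under OCR-D core v3, the concrete file group names are chosen at invocation time, and core only checks that exactly two input groups (GT and OCR) and one output group are passed.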
src/dinglehopper/ocrd_cli.py

@@ -1,83 +1,73 @@
-import json
+from functools import cached_property
 import os
+from typing import Optional
 
 import click
-import importlib_resources
+from ocrd_models import OcrdFileType
 from ocrd import Processor
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
-from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
+from ocrd_utils import make_file_id
 
 from .cli import process as cli_process
 
-OCRD_TOOL = json.loads(
-    importlib_resources.files(__name__)
-    .joinpath("ocrd-tool.json")
-    .read_text(encoding="utf-8", errors="strict")
-)
-
 
 @click.command()
 @ocrd_cli_options
 def ocrd_dinglehopper(*args, **kwargs):
     return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
 
 
 class OcrdDinglehopperEvaluate(Processor):
-    def __init__(self, *args, **kwargs):
-        kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
-        kwargs["version"] = OCRD_TOOL["version"]
-        super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
-
-    def process(self):
-        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
-        assert_file_grp_cardinality(self.output_file_grp, 1)
-
-        log = getLogger("processor.OcrdDinglehopperEvaluate")
+    @cached_property
+    def executable(self):
+        return 'ocrd-dinglehopper'
+
+    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
 
+        assert self.parameter
         metrics = self.parameter["metrics"]
         textequiv_level = self.parameter["textequiv_level"]
-        gt_grp, ocr_grp = self.input_file_grp.split(",")
 
-        input_file_tuples = self.zip_input_files(on_error="abort")
-        for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
-            if not gt_file or not ocr_file:
-                # file/page was not found in this group
-                continue
-            gt_file = self.workspace.download_file(gt_file)
-            ocr_file = self.workspace.download_file(ocr_file)
-            page_id = gt_file.pageId
-
-            log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
-
-            file_id = make_file_id(ocr_file, self.output_file_grp)
-            report_prefix = os.path.join(self.output_file_grp, file_id)
-
-            # Process the files
-            try:
-                os.mkdir(self.output_file_grp)
-            except FileExistsError:
-                pass
-            cli_process(
-                gt_file.local_filename,
-                ocr_file.local_filename,
-                report_prefix,
-                metrics=metrics,
-                textequiv_level=textequiv_level,
-            )
-
-            # Add reports to the workspace
-            for report_suffix, mimetype in [
-                [".html", "text/html"],
-                [".json", "application/json"],
-            ]:
-                self.workspace.add_file(
-                    file_id=file_id + report_suffix,
-                    file_grp=self.output_file_grp,
-                    page_id=page_id,
-                    mimetype=mimetype,
-                    local_filename=report_prefix + report_suffix,
-                )
+        try:
+            gt_file, ocr_file = input_files
+            assert gt_file, 'missing GT file'
+            assert ocr_file, 'missing OCR file'
+            assert gt_file.local_filename
+            assert ocr_file.local_filename
+        except (ValueError, AssertionError) as err:
+            self.logger.warning(f'Missing either GT file, OCR file or both: {err}')  # TODO how to log which page?
+            return
+
+        page_id = gt_file.pageId
+
+        file_id = make_file_id(ocr_file, self.output_file_grp)
+        report_prefix = os.path.join(self.output_file_grp, file_id)
+
+        # Process the files
+        try:
+            os.mkdir(self.output_file_grp)
+        except FileExistsError:
+            pass
+        cli_process(
+            gt_file.local_filename,
+            ocr_file.local_filename,
+            report_prefix,
+            metrics=metrics,
+            textequiv_level=textequiv_level,
+        )
+
+        # Add reports to the workspace
+        for report_suffix, mimetype in [
+            [".html", "text/html"],
+            [".json", "application/json"],
+        ]:
+            self.workspace.add_file(
+                file_id=file_id + report_suffix,
+                file_grp=self.output_file_grp,
+                page_id=page_id,
+                mimetype=mimetype,
+                local_filename=report_prefix + report_suffix,
+            )
 
 
 if __name__ == "__main__":
     ocrd_dinglehopper()
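This ports the processor to the OCR-D core v3 API: core now matches GT and OCR files per page, downloads them, and calls process_page_file() once per page, so the explicit file-group splitting, download, and page loop all disappear from the processor. The executable property replaces the manual OCRD_TOOL lookup; core v3 uses it to select the matching entry from the packaged ocrd-tool.json.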
src/dinglehopper/tests/test_character_error_rate.py

@@ -14,9 +14,9 @@ def test_character_error_rate():
     assert character_error_rate("Foo", "") == 3 / 3
 
     assert character_error_rate("", "") == 0
-    assert math.isinf(character_error_rate("", "Foo"))
+    assert character_error_rate("", "Foo") == 3 / 3
 
-    assert character_error_rate("Foo", "Food") == 1 / 3
+    assert character_error_rate("Foo", "Food") == 1 / 4
     assert character_error_rate("Fnord", "Food") == 2 / 5
     assert character_error_rate("Müll", "Mull") == 1 / 4
     assert character_error_rate("Abstand", "Sand") == 4 / 7
src/dinglehopper/tests/test_edit_distance.py

@@ -6,8 +6,8 @@ from .. import distance
 
 
 def test_distance():
-    assert distance("Fnord", "Food") == 2
-    assert distance("Müll", "Mull") == 1
+    assert distance("Fnord", "Food") == 2 / 5
+    assert distance("Müll", "Mull") == 1 / 4
 
     word1 = unicodedata.normalize("NFC", "Schlyñ")
     word2 = unicodedata.normalize("NFD", "Schlyñ")  # Different, decomposed!
@@ -21,4 +21,4 @@ def test_distance():
     assert (
         len(word2) == 7
     )  # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
-    assert distance(word1, word2) == 1
+    assert distance(word1, word2) == 1 / 6
src/dinglehopper/tests/test_integ_character_error_rate_ocr.py

@@ -56,4 +56,4 @@ def test_character_error_rate_between_page_alto_2():
         )
     )
 
-    assert character_error_rate(gt, ocr) == 8 / 591  # Manually verified
+    assert character_error_rate(gt, ocr) == 8 / 594  # Manually verified
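The denominator grows from 591 to 594 because the rate is now computed against the longer of the two texts; presumably the OCR text of this page is 594 grapheme clusters long, three more than the ground truth.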
src/dinglehopper/tests/test_integ_cli_valid_json.py

@@ -32,11 +32,11 @@ def test_cli_json_cer_is_infinity(tmp_path):
 
     with working_directory(tmp_path):
         with open("gt.txt", "w") as gtf:
-            gtf.write("")  # Empty to yield CER == inf
+            gtf.write("")
         with open("ocr.txt", "w") as ocrf:
             ocrf.write("Not important")
 
         process("gt.txt", "ocr.txt", "report")
         with open("report.json", "r") as jsonf:
             j = json.load(jsonf)
-            assert j["cer"] == pytest.approx(float("inf"))
+            assert j["cer"] == pytest.approx(1.0)
src/dinglehopper/tests/test_integ_edit_distance_ocr.py

@@ -17,7 +17,7 @@ def test_distance_between_page_files():
     # → 2 differences
     gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
     ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
-    assert distance(gt, ocr) == 2
+    assert distance(gt, ocr) == 2 / 827
 
 
 @pytest.mark.integration
@@ -52,4 +52,4 @@ def test_distance_between_page_alto_2():
         )
     )
 
-    assert distance(gt, ocr) == 8  # Manually verified
+    assert distance(gt, ocr) == 8 / 594  # Manually verified
@@ -12,9 +12,9 @@ from .util import working_directory
 @pytest.mark.parametrize(
     "gt_file_content,ocr_file_content,cer_expected",
     [
-        ("", "Lorem ipsum", math.inf),
+        ("", "Lorem ipsum", 1.0),
         ("Lorem ipsum", "", 1.0),
-        ("\ufeff", "Lorem ipsum", math.inf),
+        ("\ufeff", "Lorem ipsum", 1.0),
         ("Lorem ipsum", "\ufeff", 1.0),
         ("", "", 0.0),
         ("\ufeff", "", 0.0),
src/dinglehopper/tests/test_integ_word_error_rate_ocr.py

@@ -64,5 +64,5 @@ def test_word_error_rate_between_page_alto_2():
     )
 
     assert (
-        word_error_rate(gt, ocr) == 7 / gt_word_count
+        word_error_rate(gt, ocr) == 7 / (gt_word_count + 1)
     )  # Manually verified, 6 words are wrong, 1 got split (=2 errors)
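The new denominator follows directly from the normalization: one GT word got split, so the OCR side is one word longer than the ground truth, and the 7 word errors are now divided by that longer length, gt_word_count + 1, rather than by the GT word count.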
src/dinglehopper/tests/test_word_error_rate.py

@@ -76,7 +76,7 @@ def test_word_error_rate():
     )
 
     assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
-    assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!"))
+    assert word_error_rate("", "Dies ist ein Beispielsatz!") == 4 / 4
     assert word_error_rate("", "") == 0
 
     assert (
src/dinglehopper/word_error_rate.py

@@ -96,15 +96,10 @@ def _(reference: Iterable[T], compared: Iterable[T]) -> Tuple[float, int]:
     reference_seq = list(reference)
     compared_seq = list(compared)
 
-    d = Levenshtein.distance(reference_seq, compared_seq)
+    d = Levenshtein.normalized_distance(reference_seq, compared_seq)
     n = len(reference_seq)
-
-    if d == 0:
-        return 0, n
-    if n == 0:
-        return float("inf"), n
-    return d / n, n
+    return d, n
 
 
 def word_error_rate(reference: T, compared: T) -> float:
     wer: float
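As with CER, the WER edge cases now degrade gracefully instead of producing infinity. A quick worked check mirroring the test changes above, assuming the package-level export that the tests use:

    from dinglehopper import word_error_rate

    # the distance is already a rate in [0, 1], so the empty-reference and
    # empty-hypothesis cases are now symmetric
    assert word_error_rate("Dies ist ein Beispielsatz!", "") == 1.0
    assert word_error_rate("", "Dies ist ein Beispielsatz!") == 1.0
    assert word_error_rate("", "") == 0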