Merge pull request #145 from bertsky/master

update docker
docker: use latest core base stage
2025-07-02 06:59:59 +02:00 · 2025-05-13 12:41:50 +02:00 · 2025-05-02 00:19:11 +02:00 · 2025-05-02 00:19:11 +02:00 · 2025-04-25 11:31:29 +02:00 · 2025-04-25 11:20:00 +02:00
39 changed files with 581 additions and 122 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1,5 @@
+src/dinglehopper/tests
+dist
+build
+*.egg-info
+.git
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -17,7 +17,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
      - name: Upgrade pip
        run: python3 -m pip install --upgrade pip
      - name: Install setuptools
@ -32,7 +32,7 @@ jobs:
      - name: Build package
        run: python3 -m pip install --upgrade build && python3 -m build
      - name: Upload dist
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist/
@ -42,7 +42,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Download dist
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/
@ -61,7 +61,7 @@ jobs:
      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
    steps:
      - name: Download dist
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -1,4 +1,4 @@
-name: Test
+name: 'Test'

 on:

@ -25,18 +25,19 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ]

    runs-on: "ubuntu-latest"

    steps:
      - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
+          allow-prereleases: true

      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Install possible lxml build requirements (if building from source)
        run: sudo apt-get install -y libxml2-dev libxslt-dev python3-dev
@ -56,7 +57,7 @@ jobs:
            cd src
            python3 -m pytest --junitxml=../${{matrix.python-version}}-junit.xml -o junit_family=legacy
      - name: Upload test results
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        if: success() || failure()
        with:
          name: test-results-${{matrix.python-version}}
--- a/.github/workflows/test_report.yml
+++ b/.github/workflows/test_report.yml
@ -1,4 +1,4 @@
-name: 'Test Report'
+name: 'Test - Report results'
 on:
  workflow_run:
    workflows: ['test']
@ -12,9 +12,9 @@ jobs:
  report:
    runs-on: ubuntu-latest
    steps:
-      - uses: dorny/test-reporter@v1.7.0
+      - uses: dorny/test-reporter@v1
        with:
          artifact: /test-results-(.*)/
-          name: 'Tests Results - $1'
+          name: 'test - Results ($1)'
          path: '*junit.xml'
          reporter: java-junit
--- a/.gitignore
+++ b/.gitignore
@ -25,6 +25,7 @@ dmypy.json

 # User-specific stuff
 .idea
+.*.swp

 # Build artifacts
 /build
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,6 +1,6 @@
 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
    hooks:
    -   id: trailing-whitespace
    -   id: end-of-file-fixer
@ -11,12 +11,12 @@ repos:
    -   id: check-ast

 -   repo: https://github.com/psf/black
-    rev: 24.4.2
+    rev: 25.1.0
    hooks:
    -   id: black

 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.3
+    rev: v0.11.7
    hooks:
    -   args:
        -   --fix
@ -24,7 +24,7 @@ repos:
        id: ruff

 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.10.0
+    rev: v1.15.0
    hooks:
    -   additional_dependencies:
        -   types-setuptools
@ -36,6 +36,12 @@ repos:
        id: mypy

 -   repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
-    rev: v0.3.1post2
+    rev: v0.6.1
    hooks:
    -   id: pre-commit-update
+
+-   repo: https://github.com/dhatim/python-license-check
+    rev: 0.9.2
+    hooks:
+    -   id: liccheck
+        language: system
--- a/40
+++ b/40
@ -0,0 +1,40 @@
+ARG DOCKER_BASE_IMAGE
+FROM $DOCKER_BASE_IMAGE
+ARG VCS_REF
+ARG BUILD_DATE
+LABEL \
+    maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
+    org.label-schema.vcs-ref=$VCS_REF \
+    org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
+    org.label-schema.build-date=$BUILD_DATE \
+    org.opencontainers.image.vendor="Staatsbibliothek zu Berlin — SPK" \
+    org.opencontainers.image.title="dinglehopper" \
+    org.opencontainers.image.description="An OCR evaluation tool" \
+    org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
+    org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
+    org.opencontainers.image.revision=$VCS_REF \
+    org.opencontainers.image.created=$BUILD_DATE \
+    org.opencontainers.image.base.name=ocrd/core
+
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+
+# avoid HOME/.local/share (hard to predict USER here)
+# so let XDG_DATA_HOME coincide with fixed system location
+# (can still be overridden by derived stages)
+ENV XDG_DATA_HOME /usr/local/share
+# avoid the need for an extra volume for persistent resource user db
+# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
+ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
+
+WORKDIR /build/dinglehopper
+COPY . .
+COPY ocrd-tool.json .
+# prepackage ocrd-tool.json as ocrd-all-tool.json
+RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
+# prepackage ocrd-all-module-dir.json
+RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json
+RUN make install && rm -rf /build/dinglehopper
+
+WORKDIR /data
+VOLUME /data
--- a/2
+++ b/2
@ -186,7 +186,7 @@
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

-   Copyright 2019 qurator
+   Copyright 2019-2025 Staatsbibliothek zu Berlin — SPK

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
--- a/34
+++ b/34
@ -0,0 +1,34 @@
+PYTHON = python3
+PIP = pip3
+PYTHONIOENCODING=utf8
+PYTEST_ARGS = -vv
+
+DOCKER_BASE_IMAGE ?= docker.io/ocrd/core:latest
+DOCKER_TAG ?= ocrd/dinglehopper
+DOCKER ?= docker
+
+help:
+	@echo
+	@echo "  Targets"
+	@echo
+	@echo "    install Install full Python package via pip"
+	@echo "    docker  Build the ocrd/dinglehopper docker image"
+
+# Install Python package via pip
+install:
+	$(PIP) install .
+
+install-dev:
+	$(PIP) install -e .
+
+test:
+	pytest $(PYTEST_ARGS)
+
+docker:
+	$(DOCKER) build \
+	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
+	--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
+	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
+	-t $(DOCKER_TAG) .
+
+.PHONY: help install install-dev test docker
--- a/README.md
+++ b/README.md
@ -112,9 +112,13 @@ You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.
 with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate
 CLI interface:

-~~~
+```
 dinglehopper-line-dirs gt/ ocr/
-~~~
+```
+
+The CLI `dinglehopper-line-dirs` can also work with GT text files in the same
+directories as the OCR text files. You should read `dinglehopper-line-dirs --help`
+in this case.

 ### dinglehopper-extract
 The tool `dinglehopper-extract` extracts the text of the given input file on
--- a/pyproject.toml
+++ b/pyproject.toml
@ -7,8 +7,9 @@ authors = [
    {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
    {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
 ]
-description = "The OCR evaluation tool"
+description = "An OCR evaluation tool"
 readme = "README.md"
+license.file = "LICENSE"
 requires-python = ">=3.8"
 keywords = ["qurator", "ocr", "evaluation", "ocr-d"]

@ -48,7 +49,7 @@ optional-dependencies.dev = {file = ["requirements-dev.txt"]}
 where = ["src"]

 [tool.setuptools.package-data]
-dinglehopper = ["templates/*"]
+dinglehopper = ["templates/*", "*.json"]


 [tool.pytest.ini_options]
@ -74,5 +75,40 @@ disallow_untyped_defs = false
 disallow_untyped_calls = false


-[tool.ruff]
+[tool.ruff.lint]
 select = ["E", "F", "I"]
+
+
+[tool.liccheck]
+authorized_licenses = [
+    "bsd",
+    "new bsd",
+    "bsd license",
+    "new bsd license",
+    "simplified bsd",
+    "apache",
+    "apache 2.0",
+    "apache software license",
+    "apache software",
+    "apache license 2.0",
+    "gnu lgpl",
+    "lgpl with exceptions or zpl",
+    "GNU Library or Lesser General Public License (LGPL)",
+    "GNU Lesser General Public License v3 (LGPLv3)",
+    "GNU Lesser General Public License v2 or later (LGPLv2+)",
+    "mit",
+    "mit license",
+    "mit-cmu",
+    "python software foundation",
+    "psf",
+    "psf-2.0",
+    "Historical Permission Notice and Disclaimer (HPND)",
+    "public domain",
+    'The Unlicense (Unlicense)',
+    "isc",
+    "ISC License (ISCL)",
+    'Mozilla Public License 2.0 (MPL 2.0)',
+]
+unauthorized_licenses = [
+    "gpl v3",
+]
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -10,3 +10,5 @@ mypy
 types-lxml
 types-setuptools
 pytest-mypy
+
+liccheck
--- a/requirements.txt
+++ b/requirements.txt
@ -5,9 +5,10 @@ uniseg >= 0.8.0
 numpy
 colorama
 MarkupSafe
-ocrd >= 2.65.0
+ocrd >= 3.3.0
 attrs
 multimethod >= 1.3
 tqdm
 rapidfuzz >= 2.7.0
 chardet
+importlib_resources
--- a/src/dinglehopper/cli.py
+++ b/src/dinglehopper/cli.py
@ -114,6 +114,7 @@ def process(
    metrics: bool = True,
    differences: bool = False,
    textequiv_level: str = "region",
+    plain_encoding: str = "autodetect",
 ) -> None:
    """Check OCR result against GT.

@ -121,8 +122,12 @@ def process(
    this undecorated version and use Click on a wrapper.
    """

-    gt_text = extract(gt, textequiv_level=textequiv_level)
-    ocr_text = extract(ocr, textequiv_level=textequiv_level)
+    gt_text = extract(
+        gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    )
+    ocr_text = extract(
+        ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    )
    gt_words: List[str] = list(words_normalized(gt_text))
    ocr_words: List[str] = list(words_normalized(ocr_text))

@ -195,6 +200,7 @@ def process_dir(
    metrics: bool = True,
    differences: bool = False,
    textequiv_level: str = "region",
+    plain_encoding: str = "autodetect",
 ) -> None:
    for gt_file in os.listdir(gt):
        gt_file_path = os.path.join(gt, gt_file)
@ -209,6 +215,7 @@ def process_dir(
                metrics=metrics,
                differences=differences,
                textequiv_level=textequiv_level,
+                plain_encoding=plain_encoding,
            )
        else:
            print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
@ -233,7 +240,13 @@ def process_dir(
    help="PAGE TextEquiv level to extract text from",
    metavar="LEVEL",
 )
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding (e.g. "utf-8") of plain text files',
+)
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
+@click.version_option()
 def main(
    gt,
    ocr,
@ -242,6 +255,7 @@ def main(
    metrics,
    differences,
    textequiv_level,
+    plain_encoding,
    progress,
 ):
    """
@ -279,6 +293,7 @@ def main(
                metrics=metrics,
                differences=differences,
                textequiv_level=textequiv_level,
+                plain_encoding=plain_encoding,
            )
    else:
        process(
@ -289,6 +304,7 @@ def main(
            metrics=metrics,
            differences=differences,
            textequiv_level=textequiv_level,
+            plain_encoding=plain_encoding,
        )


--- a/src/dinglehopper/cli_extract.py
+++ b/src/dinglehopper/cli_extract.py
@ -12,7 +12,12 @@ from .ocr_files import extract
    help="PAGE TextEquiv level to extract text from",
    metavar="LEVEL",
 )
-def main(input_file, textequiv_level):
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding (e.g. "utf-8") of plain text files',
+)
+def main(input_file, textequiv_level, plain_encoding):
    """
    Extract the text of the given INPUT_FILE.

@ -23,7 +28,9 @@ def main(input_file, textequiv_level):
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    initLogging()
-    input_text = extract(input_file, textequiv_level=textequiv_level).text
+    input_text = extract(
+        input_file, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    ).text
    print(input_text)


--- a/src/dinglehopper/cli_line_dirs.py
+++ b/src/dinglehopper/cli_line_dirs.py
@ -1,5 +1,6 @@
 import itertools
 import os
+from typing import Callable, Iterator, List, Optional, Tuple

 import click
 from jinja2 import Environment, FileSystemLoader
@ -12,6 +13,41 @@ from .ocr_files import plain_extract
 from .word_error_rate import word_error_rate_n, words_normalized


+def removesuffix(text, suffix):
+    """
+    Remove suffix from text.
+
+    Can be replaced with str.removesuffix when we only support Python >= 3.9.
+    """
+    if suffix and text.endswith(suffix):
+        return text[: -len(suffix)]
+    return text
+
+
+def is_hidden(filepath):
+    filename = os.path.basename(os.path.abspath(filepath))
+    return filename.startswith(".")
+
+
+def find_all_files(
+    dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False
+) -> Iterator[str]:
+    """
+    Find all files in dir_, returning filenames
+
+    If pred is given, pred(filename) must be True for the filename.
+
+    Does not return hidden files by default.
+    """
+    for root, _, filenames in os.walk(dir_):
+        for fn in filenames:
+            if not return_hidden and is_hidden(fn):
+                continue
+            if pred and not pred(fn):
+                continue
+            yield os.path.join(root, fn)
+
+
 def all_equal(iterable):
    g = itertools.groupby(iterable)
    return next(g, True) and not next(g, False)
@ -25,15 +61,63 @@ def common_suffix(its):
    return reversed(common_prefix(reversed(it) for it in its))


-def removesuffix(text, suffix):
-    if suffix and text.endswith(suffix):
-        return text[: -len(suffix)]
-    return text
+def find_gt_and_ocr_files(
+    gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str
+) -> Iterator[Tuple[str, str]]:
+    """
+    Find GT files and matching OCR files.
+
+    Returns pairs of GT and OCR files.
+    """
+    for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
+        ocr_fn = os.path.join(
+            ocr_dir,
+            removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix,
+        )
+        if not os.path.exists(ocr_fn):
+            raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
+
+        yield gt_fn, ocr_fn


-def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
-    gt_suffix = "".join(common_suffix(os.listdir(gt_dir)))
-    ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir)))
+def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
+    """
+    Find GT files and matching OCR files, autodetect suffixes.
+
+    This only works if gt_dir (or respectively ocr_dir) only contains GT (OCR)
+    files with a common suffix. Currently the files must have a suffix, e.g.
+    ".gt.txt" (e.g. ".ocr.txt").
+
+    Returns pairs of GT and OCR files.
+    """
+
+    # Autodetect suffixes
+    gt_files = find_all_files(gt_dir)
+    gt_suffix = "".join(common_suffix(gt_files))
+    if len(gt_suffix) == 0:
+        raise RuntimeError(
+            f"Files in GT directory {gt_dir} do not have a common suffix"
+        )
+    ocr_files = find_all_files(ocr_dir)
+    ocr_suffix = "".join(common_suffix(ocr_files))
+    if len(ocr_suffix) == 0:
+        raise RuntimeError(
+            f"Files in OCR directory {ocr_dir} do not have a common suffix"
+        )
+
+    yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
+
+
+def process(
+    gt_dir,
+    ocr_dir,
+    report_prefix,
+    *,
+    metrics=True,
+    gt_suffix=None,
+    ocr_suffix=None,
+    plain_encoding="autodetect",
+):

    cer = None
    n_characters = None
@ -42,16 +126,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
    n_words = None
    word_diff_report = ""

-    for k, gt in enumerate(os.listdir(gt_dir)):
-        # Find a match by replacing the suffix
-        ocr = removesuffix(gt, gt_suffix) + ocr_suffix
+    if gt_suffix is not None and ocr_suffix is not None:
+        gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
+    else:
+        gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)

-        gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
-        ocr_text = plain_extract(
-            os.path.join(ocr_dir, ocr), include_filename_in_id=True
+    for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
+        gt_text = plain_extract(
+            gt_fn, include_filename_in_id=True, encoding=plain_encoding
        )
-        gt_words = words_normalized(gt_text)
-        ocr_words = words_normalized(ocr_text)
+        ocr_text = plain_extract(
+            ocr_fn, include_filename_in_id=True, encoding=plain_encoding
+        )
+        gt_words: List[str] = list(words_normalized(gt_text))
+        ocr_words: List[str] = list(words_normalized(ocr_text))

        # Compute CER
        l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
@ -81,7 +169,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
            joiner="",
            none="·",
            score_hint=score_hint(l_cer, l_n_characters),
-        )
+        )[0]
        word_diff_report += gen_diff_report(
            gt_words,
            ocr_words,
@ -89,7 +177,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
            joiner=" ",
            none="⋯",
            score_hint=score_hint(l_wer, l_n_words),
-        )
+        )[0]

    env = Environment(
        loader=FileSystemLoader(
@ -123,17 +211,30 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
@click.option(
    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
 )
-def main(gt, ocr, report_prefix, metrics):
+@click.option("--gt-suffix", help="Suffix of GT line text files")
+@click.option("--ocr-suffix", help="Suffix of OCR line text files")
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding (e.g. "utf-8") of plain text files',
+)
+def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
    """
    Compare the GT line text directory against the OCR line text directory.

    This assumes that the GT line text directory contains textfiles with a common
    suffix like ".gt.txt", and the OCR line text directory contains textfiles with
    a common suffix like ".some-ocr.txt". The text files also need to be paired,
-    i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
-    in the OCT lines directory.
+    i.e. the GT filename "line001.gt.txt" needs to match a filename
+    "line001.some-ocr.txt" in the OCR lines directory.

-    The GT and OCR directories are usually round truth line texts and the results of
+    GT and OCR directories may contain line text files in matching subdirectories,
+    e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt".
+
+    GT and OCR directories can also be the same directory, but in this case you need
+    to give --gt-suffix and --ocr-suffix explicitly.
+
+    The GT and OCR directories are usually ground truth line texts and the results of
    an OCR software, but you may use dinglehopper to compare two OCR results. In
    that case, use --no-metrics to disable the then meaningless metrics and also
    change the color scheme from green/red to blue.
@ -142,9 +243,19 @@ def main(gt, ocr, report_prefix, metrics):
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).

+    It is recommended to specify the encoding of the text files, for example with
+    --plain-encoding utf-8. If this option is not given, we try to auto-detect it.
    """
    initLogging()
-    process(gt, ocr, report_prefix, metrics=metrics)
+    process(
+        gt,
+        ocr,
+        report_prefix,
+        metrics=metrics,
+        gt_suffix=gt_suffix,
+        ocr_suffix=ocr_suffix,
+        plain_encoding=plain_encoding,
+    )


 if __name__ == "__main__":
--- a/src/dinglehopper/extracted_text.py
+++ b/src/dinglehopper/extracted_text.py
@ -149,7 +149,7 @@ class ExtractedText:
                raise ValueError("Can't have joiner without segments to join")
        if self.segments is not None:
            if value not in ("", " ", "\n"):
-                raise ValueError(f"Unexcepted segment joiner value {repr(value)}")
+                raise ValueError(f"Unexpected segment joiner value {repr(value)}")

    @_text.validator
    def is_valid_text(self, _, value):
--- a/src/dinglehopper/ocr_files.py
+++ b/src/dinglehopper/ocr_files.py
@ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional
 import chardet
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
+from ocrd_utils import getLogger
 from uniseg.graphemecluster import grapheme_clusters

 from .extracted_text import ExtractedText, normalize_sbb

+log = getLogger("processor.OcrdDinglehopperEvaluate")
+

 def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
    """Return the ALTO namespace used in the given ElementTree.
@ -36,7 +39,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]:
    for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
        line_id = line.attrib.get("ID")
        line_text = " ".join(
-            string.attrib.get("CONTENT")
+            string.attrib.get("CONTENT", "")
            for string in line.iterfind("alto:String", namespaces=nsmap)
        )
        normalized_text = normalize_sbb(line_text)
@ -149,7 +152,7 @@ def detect_encoding(filename):
    return chardet.detect(open(filename, "rb").read(1024))["encoding"]


-def plain_extract(filename, include_filename_in_id=False):
+def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"):
    id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"

    def make_segment(no, line):
@ -163,11 +166,18 @@ def plain_extract(filename, include_filename_in_id=False):
            clusters,
        )

-    fileencoding = detect_encoding(filename)
+    if encoding == "autodetect":
+        fileencoding = detect_encoding(filename)
+        log.warning(
+            f"Autodetected encoding as '{fileencoding}'"
+            ", it is recommended to specify it explicitly with --plain-encoding"
+        )
+    else:
+        fileencoding = encoding
    with open(filename, "r", encoding=fileencoding) as f:
        return ExtractedText(
            None,
-            [make_segment(no, line) for no, line in enumerate(f.readlines())],
+            [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())],
            "\n",
            None,
            None,
@ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False):
    # XXX hardcoded SBB normalization


-def plain_text(filename):
-    return plain_extract(filename).text
+def plain_text(filename, encoding="autodetect"):
+    return plain_extract(filename, encoding=encoding).text


-def extract(filename, *, textequiv_level="region"):
+def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"):
    """Extract the text from the given file.

    Supports PAGE, ALTO and falls back to plain text.
@ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"):
    try:
        tree = ET.parse(filename)
    except (XMLSyntaxError, UnicodeDecodeError):
-        return plain_extract(filename)
+        return plain_extract(filename, encoding=plain_encoding)
    try:
        return page_extract(tree, textequiv_level=textequiv_level)
    except ValueError:
--- a/src/dinglehopper/ocrd-tool.json
+++ b/src/dinglehopper/ocrd-tool.json
@ -1,17 +1,13 @@
 {
-  "version": "0.9.6",
+  "version": "0.11.0",
  "git_url": "https://github.com/qurator-spk/dinglehopper",
+  "dockerhub": "ocrd/dinglehopper",
  "tools": {
    "ocrd-dinglehopper": {
      "executable": "ocrd-dinglehopper",
+      "input_file_grp_cardinality": 2,
+      "output_file_grp_cardinality": 1,
      "description": "Evaluate OCR text against ground truth with dinglehopper",
-      "input_file_grp": [
-        "OCR-D-GT-PAGE",
-        "OCR-D-OCR"
-      ],
-      "output_file_grp": [
-        "OCR-D-OCR-EVAL"
-      ],
      "categories": [
        "Quality assurance"
      ],
@ -29,6 +25,11 @@
          "enum": ["region", "line"],
          "default": "region",
          "description": "PAGE XML hierarchy level to extract the text from"
+        },
+        "plain_encoding": {
+          "type": "string",
+          "default": "autodetect",
+          "description": "Encoding (e.g. \"utf-8\") of plain text files"
        }
      }
    }
--- a/src/dinglehopper/ocrd_cli.py
+++ b/src/dinglehopper/ocrd_cli.py
@ -1,78 +1,78 @@
-import json
+from functools import cached_property
 import os
+from typing import Optional

 import click
+from ocrd_models import OcrdFileType
 from ocrd import Processor
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
-from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
-from pkg_resources import resource_string
+from ocrd_utils import make_file_id

 from .cli import process as cli_process

-OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
-
-
@click.command()
@ocrd_cli_options
 def ocrd_dinglehopper(*args, **kwargs):
    return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)

-
 class OcrdDinglehopperEvaluate(Processor):
-    def __init__(self, *args, **kwargs):
-        kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
-        super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)

-    def process(self):
-        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
-        assert_file_grp_cardinality(self.output_file_grp, 1)
+    @cached_property
+    def executable(self):
+        return 'ocrd-dinglehopper'

-        log = getLogger("processor.OcrdDinglehopperEvaluate")
+    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:

+        assert self.parameter
        metrics = self.parameter["metrics"]
        textequiv_level = self.parameter["textequiv_level"]
-        gt_grp, ocr_grp = self.input_file_grp.split(",")
+        plain_encoding = self.parameter["plain_encoding"]

-        input_file_tuples = self.zip_input_files(on_error="abort")
-        for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
-            if not gt_file or not ocr_file:
-                # file/page was not found in this group
-                continue
-            gt_file = self.workspace.download_file(gt_file)
-            ocr_file = self.workspace.download_file(ocr_file)
-            page_id = gt_file.pageId
+        # wrong number of inputs: let fail
+        gt_file, ocr_file = input_files
+        # missing on either side: skip (zip_input_files already warned)
+        if not gt_file or not ocr_file:
+            return
+        # missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
+        if not gt_file.local_filename:
+            if config.OCRD_MISSING_INPUT == 'ABORT':
+                raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype)
+            return
+        if not ocr_file.local_filename:
+            if config.OCRD_MISSING_INPUT == 'ABORT':
+                raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype)
+            return

-            log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
+        page_id = gt_file.pageId

-            file_id = make_file_id(ocr_file, self.output_file_grp)
-            report_prefix = os.path.join(self.output_file_grp, file_id)
+        file_id = make_file_id(ocr_file, self.output_file_grp)
+        cli_process(
+            gt_file.local_filename,
+            ocr_file.local_filename,
+            file_id,
+            self.output_file_grp,
+            metrics=metrics,
+            textequiv_level=textequiv_level,
+            plain_encoding=plain_encoding,
+        )

-            # Process the files
-            try:
-                os.mkdir(self.output_file_grp)
-            except FileExistsError:
-                pass
-            cli_process(
-                gt_file.local_filename,
-                ocr_file.local_filename,
-                report_prefix,
-                metrics=metrics,
-                textequiv_level=textequiv_level,
+        # Add reports to the workspace
+        for report_suffix, mimetype in [
+            [".html", "text/html"],
+            [".json", "application/json"],
+        ]:
+            output_file_id = file_id + report_suffix
+            output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
+            if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
+                raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set")
+            self.workspace.add_file(
+               file_id=output_file_id,
+                file_grp=self.output_file_grp,
+                page_id=page_id,
+                mimetype=mimetype,
+                local_filename=file_id + report_suffix,
            )

-            # Add reports to the workspace
-            for report_suffix, mimetype in [
-                [".html", "text/html"],
-                [".json", "application/json"],
-            ]:
-                self.workspace.add_file(
-                    file_id=file_id + report_suffix,
-                    file_grp=self.output_file_grp,
-                    page_id=page_id,
-                    mimetype=mimetype,
-                    local_filename=report_prefix + report_suffix,
-                )
-

 if __name__ == "__main__":
    ocrd_dinglehopper()
--- a/src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt
+++ b/src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt
@ -0,0 +1 @@
+This is a test.
--- a/src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt
+++ b/src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt
@ -0,0 +1 @@
+Another test.
--- a/src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt
+++ b/src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt
@ -0,0 +1 @@
+Tis is a test.
--- a/src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt
+++ b/src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt
@ -0,0 +1 @@
+AnÖther test.
--- a/src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg
+++ b/src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg
--- a/src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt
+++ b/src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt
@ -0,0 +1 @@
+This is a test.
--- a/src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt
+++ b/src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt
@ -0,0 +1 @@
+Tis is a test.
--- a/src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg
+++ b/src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg
--- a/src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt
+++ b/src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt
@ -0,0 +1 @@
+Another test.
--- a/src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt
+++ b/src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt
@ -0,0 +1 @@
+AnÖther test.
--- a/src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt
+++ b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt
@ -0,0 +1 @@
+This is a test.
--- a/src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt
+++ b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt
@ -0,0 +1 @@
+Another test.
--- a/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt
+++ b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt
@ -0,0 +1 @@
+Tis is a test.
--- a/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt
+++ b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt
@ -0,0 +1 @@
+AnÖther test.
--- a/src/dinglehopper/tests/test_integ_cli_line_dirs.py
+++ b/src/dinglehopper/tests/test_integ_cli_line_dirs.py
@ -0,0 +1,61 @@
+import json
+import os.path
+import re
+
+import pytest
+
+from ..cli_line_dirs import process
+from .util import working_directory
+
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+@pytest.mark.integration
+def test_cli_line_dirs_basic(tmp_path):
+    """Test that the cli/process() produces a good report"""
+
+    with working_directory(tmp_path):
+        gt_dir = os.path.join(data_dir, "line_dirs/basic/gt")
+        ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr")
+        process(gt_dir, ocr_dir, "report")
+        with open("report.json", "r") as jsonf:
+            print(jsonf.read())
+        with open("report.json", "r") as jsonf:
+            j = json.load(jsonf)
+            assert j["cer"] == pytest.approx(0.1071429)
+            assert j["wer"] == pytest.approx(0.5)
+
+
+@pytest.mark.integration
+def test_cli_line_dirs_basic_report_diff(tmp_path):
+    """Test that the cli/process() produces a report with char+word diff"""
+
+    with working_directory(tmp_path):
+        gt_dir = os.path.join(data_dir, "line_dirs/basic/gt")
+        ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr")
+        process(gt_dir, ocr_dir, "report")
+
+        with open("report.html", "r") as htmlf:
+            html_report = htmlf.read()
+
+    # Counting GT lines in the diff
+    assert len(re.findall(r"gt.*l\d+-cdiff", html_report)) == 2
+    assert len(re.findall(r"gt.*l\d+-wdiff", html_report)) == 2
+
+
+@pytest.mark.integration
+def test_cli_line_dirs_merged(tmp_path):
+    """Test that cli/process() produces a good report for merged GT/OCR dirs"""
+
+    with working_directory(tmp_path):
+        gt_dir = os.path.join(data_dir, "line_dirs/merged")
+        ocr_dir = os.path.join(data_dir, "line_dirs/merged")
+        process(
+            gt_dir, ocr_dir, "report", gt_suffix=".gt.txt", ocr_suffix=".some-ocr.txt"
+        )
+        with open("report.json", "r") as jsonf:
+            print(jsonf.read())
+        with open("report.json", "r") as jsonf:
+            j = json.load(jsonf)
+            assert j["cer"] == pytest.approx(0.1071429)
+            assert j["wer"] == pytest.approx(0.5)
--- a/src/dinglehopper/tests/test_integ_cli_valid_report.py
+++ b/src/dinglehopper/tests/test_integ_cli_valid_report.py
@ -1,4 +1,5 @@
 import json
+import re

 import pytest

@ -40,3 +41,25 @@ def test_cli_json_cer_is_infinity(tmp_path):
        with open("report.json", "r") as jsonf:
            j = json.load(jsonf)
            assert j["cer"] == pytest.approx(float("inf"))
+
+
+@pytest.mark.integration
+def test_cli_html(tmp_path):
+    """Test that the cli/process() yields complete HTML report"""
+
+    with working_directory(tmp_path):
+        with open("gt.txt", "w") as gtf:
+            gtf.write("AAAAA")
+        with open("ocr.txt", "w") as ocrf:
+            ocrf.write("AAAAB")
+
+        process("gt.txt", "ocr.txt", "report")
+
+        with open("report.html", "r") as htmlf:
+            html_report = htmlf.read()
+            print(html_report)
+
+        assert re.search(r"CER: 0\.\d+", html_report)
+        assert re.search(r"WER: 1\.0", html_report)
+        assert len(re.findall("gt.*cdiff", html_report)) == 1
+        assert len(re.findall("gt.*wdiff", html_report)) == 1
--- a/src/dinglehopper/tests/test_line_dirs.py
+++ b/src/dinglehopper/tests/test_line_dirs.py
@ -0,0 +1,71 @@
+import os
+
+from ..cli_line_dirs import find_gt_and_ocr_files, find_gt_and_ocr_files_autodetect
+
+data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+def test_basic():
+    """Test the dumb method: User gives directories and suffixes."""
+    pairs = list(
+        find_gt_and_ocr_files(
+            os.path.join(data_dir, "line_dirs/basic/gt"),
+            ".gt.txt",
+            os.path.join(data_dir, "line_dirs/basic/ocr"),
+            ".some-ocr.txt",
+        )
+    )
+
+    assert len(pairs) == 2
+
+
+def test_basic_autodetect():
+    """Test autodetect: User gives directories, suffixes are autodetected if possible"""
+    pairs = list(
+        find_gt_and_ocr_files_autodetect(
+            os.path.join(data_dir, "line_dirs/basic/gt"),
+            os.path.join(data_dir, "line_dirs/basic/ocr"),
+        )
+    )
+
+    assert len(pairs) == 2
+
+
+def test_subdirs():
+    """Test the dumb method: Should also work when subdirectories are involved."""
+    pairs = list(
+        find_gt_and_ocr_files(
+            os.path.join(data_dir, "line_dirs/subdirs/gt"),
+            ".gt.txt",
+            os.path.join(data_dir, "line_dirs/subdirs/ocr"),
+            ".some-ocr.txt",
+        )
+    )
+
+    assert len(pairs) == 2
+
+
+def test_subdirs_autodetect():
+    """Test the autodetect method: Should also work when subdirectories are involved."""
+    pairs = list(
+        find_gt_and_ocr_files_autodetect(
+            os.path.join(data_dir, "line_dirs/subdirs/gt"),
+            os.path.join(data_dir, "line_dirs/subdirs/ocr"),
+        )
+    )
+
+    assert len(pairs) == 2
+
+
+def test_merged():
+    """Test the dumb method: GT and OCR texts are in the same directories."""
+    pairs = list(
+        find_gt_and_ocr_files(
+            os.path.join(data_dir, "line_dirs/merged"),
+            ".gt.txt",
+            os.path.join(data_dir, "line_dirs/merged"),
+            ".some-ocr.txt",
+        )
+    )
+
+    assert len(pairs) == 2
--- a/src/dinglehopper/tests/test_ocr_files.py
+++ b/src/dinglehopper/tests/test_ocr_files.py
@ -177,8 +177,20 @@ def test_text():
 def test_plain(tmp_path):
    with working_directory(tmp_path):
        with open("ocr.txt", "w") as ocrf:
-            ocrf.write("AAAAB")
+            ocrf.write("First, a line.\nAnd a second line.\n")

        result = plain_text("ocr.txt")
-        expected = "AAAAB"
+        expected = "First, a line.\nAnd a second line."
+        assert result == expected
+
+
+def test_plain_BOM(tmp_path):
+    """Test that plain text files with BOM are read correctly."""
+    BOM = "\ufeff"
+    with working_directory(tmp_path):
+        with open("ocr.txt", "w") as ocrf:
+            ocrf.write(BOM + "First, a line.\nAnd a second line.\n")
+
+        result = plain_text("ocr.txt")
+        expected = "First, a line.\nAnd a second line."
        assert result == expected
--- a/src/dinglehopper/word_error_rate.py
+++ b/src/dinglehopper/word_error_rate.py
@ -21,12 +21,17 @@ def patch_word_break():
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
    """
    old_word_break = uniseg.wordbreak.word_break
+    if hasattr(uniseg.wordbreak, 'Word_Break'):
+        aletter = uniseg.wordbreak.Word_Break.ALetter
+    else:
+        # uniseg<0.9
+        aletter = uniseg.wordbreak.WordBreak.ALETTER

-    def new_word_break(c, index=0):
+    def new_word_break(c):
        if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
-            return uniseg.wordbreak.WordBreak.ALETTER
+            return aletter
        else:
-            return old_word_break(c, index)
+            return old_word_break(c)

    uniseg.wordbreak.word_break = new_word_break
    global word_break_patched
Author	SHA1	Message	Date
Mike Gerber	3443edd6d3	Merge pull request #145 from bertsky/master update docker	2025-05-13 12:41:50 +02:00
Robert Sachunsky	b1ef3af1a8	docker: use latest core base stage	2025-05-02 00:19:11 +02:00
Robert Sachunsky	d09e3969f8	docker: prepackage ocrd-all-module-dir.json	2025-05-02 00:19:11 +02:00
Mike Gerber	b5e99d96c9	Merge pull request #144 from qurator-spk/fix/make-test-results-clearer ✔ GitHub Actions: Make reporting results clearer	2025-04-25 11:31:29 +02:00
Mike Gerber	774790c36f	✔ GitHub Actions: Make reporting results clearer In the "Actions" tab on GitHub, the workflow run that would post test results to the _original_ workflow run is named "Test Report". This would lead me to click on it to see the results, just to be disappointed. This aims to make the naming of the GitHub workflows/jobs clearer.	2025-04-25 11:20:00 +02:00
Mike Gerber	addb572922	Merge pull request #143 from qurator-spk/chore/update-pre-commit ⚙ pre-commit: update	2025-04-25 10:14:30 +02:00
Mike Gerber	1ebb004386	⚙ pre-commit: update	2025-04-25 10:13:06 +02:00
Mike Gerber	c3aa48ec3b	Merge branch 'master' of https://github.com/qurator-spk/dinglehopper	2025-04-24 17:16:06 +02:00
Mike Gerber	628594ef98	📦 v0.11.0	2025-04-24 17:14:44 +02:00
Mike Gerber	d7814db705	Merge pull request #142 from qurator-spk/feat/flex-line-dirs Feat/flex line dirs	2025-04-24 16:48:22 +02:00
Mike Gerber	5639f3db7f	✔ Add a tests that checks if plain text files with BOM are read correctly	2025-04-24 16:44:29 +02:00
Mike Gerber	9fc8937324	✒ README: Mention dinglehopper-line-dirs --help	2025-04-24 15:13:19 +02:00
Mike Gerber	14a4bc56d8	🐛 Add --plain-encoding option to dinglehopper-extract	2025-04-22 18:24:35 +02:00
Mike Gerber	a70260c10e	🐛 Use warning() to fix DeprecationWarning	2025-04-22 13:57:19 +02:00
Gerber, Mike	224aa02163	🚧 Fix help text	2025-04-22 13:57:19 +02:00
Gerber, Mike	9db5b4caf5	🚧 Add OCR-D parameter for plain text encoding	2025-04-22 13:57:19 +02:00
Gerber, Mike	5578ce83a3	🚧 Add option for text encoding to line dir cli	2025-04-22 13:57:19 +02:00
Gerber, Mike	cf59b951a3	🚧 Add option for text encoding to line dir cli	2025-04-22 13:57:19 +02:00
Gerber, Mike	480b3cf864	✔ Test that CLI produces a complete HTML report	2025-04-22 13:57:19 +02:00
Gerber, Mike	f1a586cff1	✔ Test line dirs CLI	2025-04-22 13:57:18 +02:00
Gerber, Mike	3b16c14c16	✔ Properly test line dir finding	2025-04-22 13:57:18 +02:00
Gerber, Mike	322faeb26c	🎨 Sort imports	2025-04-22 13:57:18 +02:00
Gerber, Mike	c37316da09	🐛 cli_line_dirs: Fix word differences section At the time of generation of the section, the {gt,ocr}_words generators were drained. Fix by using a list. Fixes gh-124.	2025-04-22 13:57:18 +02:00
Gerber, Mike	9414a92f9f	🐛 cli_line_dirs: Type-annotate functions	2025-04-22 13:57:18 +02:00
Gerber, Mike	68344e48f8	🎨 Reformat cli_line_dirs	2025-04-22 13:57:18 +02:00
Gerber, Mike	73ee16fe51	🚧 Support 'merged' GT+OCR line directories	2025-04-22 13:57:18 +02:00
Gerber, Mike	6980d7a252	🚧 Use our own removesuffix() as we still support Python 3.8	2025-04-22 13:57:18 +02:00
Gerber, Mike	2bf2529c38	🚧 Port new line dir functions	2025-04-22 13:57:17 +02:00
Gerber, Mike	ad8e6de36b	🐛 cli_line_dirs: Fix character diff reports	2025-04-22 13:57:17 +02:00
Gerber, Mike	4024e350f7	🚧 Test new flexible line dirs functions	2025-04-22 13:57:17 +02:00
Mike Gerber	3c317cbeaf	Merge pull request #141 from qurator-spk/chore/update-pre-commit ⚙ pre-commit: update	2025-04-22 12:35:14 +02:00
Mike Gerber	d8403421fc	⚙ pre-commit: update	2025-04-22 12:30:47 +02:00
Mike Gerber	3305043234	Merge pull request #140 from qurator-spk/fix/vendor-strings 🐛 Fix vendor strings	2025-04-22 11:50:29 +02:00
Mike Gerber	6bf5bd7178	🐛 Fix vendor strings	2025-04-22 11:48:44 +02:00
Mike Gerber	817e0c95f7	📦 v0.10.1	2025-04-22 10:32:29 +02:00
Mike Gerber	3d7c7ee1e3	Merge pull request #139 from bertsky/allow-uniseg-py38 re-allow uniseg 0.8 and py38	2025-04-22 10:09:51 +02:00
Robert Sachunsky	a24623b966	re-allow py38	2025-04-17 16:47:13 +02:00
Robert Sachunsky	ea33602336	CI: reactivate py38	2025-04-17 16:12:42 +02:00
Robert Sachunsky	64444dd419	opt out of `7f8a8dd5` (uniseg update that requires py39)	2025-04-17 16:12:37 +02:00
Mike Gerber	f6dfb77f94	🐛 pyproject.toml: Fix description	2025-04-17 08:51:32 +02:00
Mike Gerber	ef817cb343	📦 v0.10.0	2025-04-17 08:37:37 +02:00
Mike Gerber	b1c109baae	Merge pull request #128 from kba/v3-api V3 api	2025-04-17 08:34:51 +02:00
Mike Gerber	13ab1ae150	🐛 Docker: Use same vendor as license for now	2025-04-17 08:26:36 +02:00
Mike Gerber	d974369e13	🐛 Docker: Fix description	2025-04-17 08:10:56 +02:00
Mike Gerber	b7bdca4ac8	🐛 Makefile: Make phony targets .PHONY	2025-04-17 08:09:06 +02:00
kba	831a24fc4c	typo: report_prefix -> file_id	2025-04-17 08:04:52 +02:00
Konstantin Baierer	f6a2c94520	ocrd_cli: but do check for existing output files Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>	2025-04-17 08:04:52 +02:00
Konstantin Baierer	4162836612	ocrd_cli: no need to check fileGrp dir exists Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>	2025-04-17 08:04:52 +02:00
Konstantin Baierer	c0aa82d188	OCR-D processor: properly handle missing or non-downloaded GT/OCR file Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>	2025-04-17 08:04:51 +02:00
kba	8c1b6d65f5	Dockerfile: build ocrd-all-tool.json	2025-04-17 08:04:51 +02:00
Mike Gerber	f287386c0e	🧹Don't pin uniseg and rapidfuzz Breakage with the newest uniseg API was fixed in master. Can't see any issue with rapidfuzz, so removing that pin, too.	2025-04-16 14:49:23 +02:00
kba	63031b30bf	Port to OCR-D/core API v3	2025-04-16 14:45:16 +02:00
Mike Gerber	bf6633be02	Merge pull request #136 from qurator-spk/chore/update-liccheck ⚙ liccheck: update permissable licenses (mit-cmu, psf 2.0, iscl)	2025-04-16 11:13:02 +02:00
Mike Gerber	d3aa9eb520	⚙ liccheck: update permissable licenses (mit-cmu, psf 2.0, iscl)	2025-04-16 11:09:33 +02:00
Mike Gerber	625686f204	Merge pull request #135 from qurator-spk/chore/update-python-version ⚙ pyproject.toml: Update supported Python version	2025-04-16 11:01:09 +02:00
Mike Gerber	ce7886af23	⚙ pyproject.toml: Update supported Python version	2025-04-16 10:57:10 +02:00
Mike Gerber	a09a624bde	Merge pull request #132 from qurator-spk/fix/uniseg-removed-index-parameter 🐛 Fix for changed API of uniseg's word_break	2025-04-16 09:28:31 +02:00
Mike Gerber	badfa9c99e	⚙ GitHub Actions: Don't test on Python 3.8 anymore	2025-04-16 09:25:44 +02:00
Mike Gerber	7f8a8dd564	🐛 Fix for changed API of uniseg's word_break	2025-04-16 09:10:43 +02:00
Mike Gerber	b72d4f5af9	Merge pull request #131 from qurator-spk/chore/update-pre-commit ⚙ pre-commit: update	2025-04-16 09:06:05 +02:00
Mike Gerber	058042accb	⚙ pre-commit: update	2025-04-16 08:59:58 +02:00
Mike Gerber	071e6a8bd1	Merge pull request #120 from joschrew/dockerfile Add Dockerfile and Makefile to create ocr-d dockerimage	2024-10-11 18:04:22 +02:00
Mike Gerber	6b82293670	Update Dockerfile I fancy-clicked @bertsky's change suggestion, which duplicated some labels. Now fancy-clicking the fix, fingers crossed...	2024-10-07 17:41:59 +02:00
Mike Gerber	6ecf49a355	Update Dockerfile Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>	2024-10-07 17:39:42 +02:00
joschrew	9c7c104dce	Add Dockerfile and Makefile to create ocr-d image	2024-10-02 15:29:36 +02:00
Mike Gerber	2e6fe0c279	Merge pull request #113 from qurator-spk/python-3.13 ✔ Test on Python 3.13	2024-09-04 19:15:04 +02:00
Mike Gerber	1753ed4d13	✔ Test on Python 3.13	2024-09-04 19:09:45 +02:00
Mike Gerber	3233dbcc8f	✔ pre-commit: Add license check	2024-07-22 16:54:33 +02:00
Mike Gerber	f2e290dffe	🐛 Fix --version option in OCR-D CLI	2024-07-19 14:54:46 +02:00
Mike Gerber	6d1daf1dfe	✨ Support --version option in CLI	2024-07-19 14:41:54 +02:00
Mike Gerber	27ad145c7e	⚙ pyproject.toml: Add license.file	2024-07-19 09:58:01 +02:00
Mike Gerber	2e9e88cc1e	⚙ pre-commit: Update hooks	2024-07-19 09:56:40 +02:00
Mike Gerber	129e6eb427	📦 v0.9.7	2024-07-11 17:25:38 +02:00
Mike Gerber	cf998443c1	⚙ ruff: Update settings (select → lint.select)	2024-07-11 17:15:24 +02:00
Mike Gerber	6048107889	Merge branch 'master' of https://github.com/qurator-spk/dinglehopper	2024-07-11 16:26:29 +02:00
Mike Gerber	2ee37ed4e3	🎨 Sort imports	2024-07-11 16:25:38 +02:00
Mike Gerber	521f034fba	Merge pull request #116 from stweil/master Fix typo	2024-07-10 01:13:24 +02:00
Mike Gerber	d1a2247615	⚙ pre-commit: Update hooks	2024-07-09 21:07:59 +02:00
Mike Gerber	4047f8b6e5	🐛 Fix loading ocrd-tool.json for Python 3.12	2024-07-09 21:01:31 +02:00
Stefan Weil	cd68a973cb	Fix typo Signed-off-by: Stefan Weil <sw@weilnetz.de>	2024-05-26 09:18:00 +02:00
Mike Gerber	bc5818da9f	✔ GitHub Actions: Update used actions	2024-05-14 15:56:08 +02:00
Mike Gerber	c91234daba	✔ GitHub Actions: Update used actions	2024-05-13 21:17:42 +02:00
Mike Gerber	a534b5e28e	⚙ pre-commit: Update hooks	2024-05-13 21:16:29 +02:00
Mike Gerber	b336f98271	🐛 Fix reading plain text files As reported by @tallemeersch in gh-107, newlines were not removed for plain text files. Fix this by stripping the lines as suggested. Fixes gh-107.	2024-05-06 18:14:16 +02:00