Compare commits


No commits in common. 'master' and 'v0.9.6' have entirely different histories.

@@ -1,5 +0,0 @@
-src/dinglehopper/tests
-dist
-build
-*.egg-info
-.git

@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
       - name: Upgrade pip
         run: python3 -m pip install --upgrade pip
       - name: Install setuptools
@@ -32,7 +32,7 @@ jobs:
       - name: Build package
         run: python3 -m pip install --upgrade build && python3 -m build
       - name: Upload dist
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
         with:
           name: dist
           path: dist/
@@ -42,7 +42,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Download dist
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v3
         with:
           name: dist
           path: dist/
@@ -61,7 +61,7 @@ jobs:
       id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
     steps:
       - name: Download dist
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v3
         with:
           name: dist
           path: dist/

@@ -25,19 +25,18 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
     runs-on: "ubuntu-latest"
     steps:
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
-          allow-prereleases: true
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
       - name: Install possible lxml build requirements (if building from source)
        run: sudo apt-get install -y libxml2-dev libxslt-dev python3-dev
@@ -57,7 +56,7 @@ jobs:
           cd src
           python3 -m pytest --junitxml=../${{matrix.python-version}}-junit.xml -o junit_family=legacy
       - name: Upload test results
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
         if: success() || failure()
         with:
           name: test-results-${{matrix.python-version}}

@@ -12,7 +12,7 @@ jobs:
   report:
     runs-on: ubuntu-latest
     steps:
-      - uses: dorny/test-reporter@v1
+      - uses: dorny/test-reporter@v1.7.0
         with:
           artifact: /test-results-(.*)/
          name: 'Tests Results - $1'

@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v4.6.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer
@@ -11,12 +11,12 @@ repos:
       - id: check-ast
   - repo: https://github.com/psf/black
-    rev: 25.1.0
+    rev: 24.4.2
     hooks:
       - id: black
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.5
+    rev: v0.4.3
     hooks:
       - args:
           - --fix
@@ -24,7 +24,7 @@ repos:
         id: ruff
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.15.0
+    rev: v1.10.0
     hooks:
       - additional_dependencies:
           - types-setuptools
@@ -36,12 +36,6 @@ repos:
         id: mypy
   - repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
-    rev: v0.6.1
+    rev: v0.3.1post2
     hooks:
       - id: pre-commit-update
-  - repo: https://github.com/dhatim/python-license-check
-    rev: 0.9.2
-    hooks:
-      - id: liccheck
-        language: system

@@ -1,38 +0,0 @@
-ARG DOCKER_BASE_IMAGE
-FROM $DOCKER_BASE_IMAGE
-ARG VCS_REF
-ARG BUILD_DATE
-LABEL \
-    maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
-    org.label-schema.vcs-ref=$VCS_REF \
-    org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
-    org.label-schema.build-date=$BUILD_DATE \
-    org.opencontainers.image.vendor="qurator" \
-    org.opencontainers.image.title="dinglehopper" \
-    org.opencontainers.image.description="An OCR evaluation tool" \
-    org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
-    org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
-    org.opencontainers.image.revision=$VCS_REF \
-    org.opencontainers.image.created=$BUILD_DATE \
-    org.opencontainers.image.base.name=ocrd/core
-
-ENV LANG=C.UTF-8
-ENV LC_ALL=C.UTF-8
-
-# avoid HOME/.local/share (hard to predict USER here)
-# so let XDG_DATA_HOME coincide with fixed system location
-# (can still be overridden by derived stages)
-ENV XDG_DATA_HOME /usr/local/share
-
-# avoid the need for an extra volume for persistent resource user db
-# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
-ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
-
-WORKDIR /build/dinglehopper
-COPY . .
-COPY ocrd-tool.json .
-# prepackage ocrd-tool.json as ocrd-all-tool.json
-RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
-RUN make install && rm -rf /build/dinglehopper
-WORKDIR /data
-VOLUME /data

@@ -1,33 +0,0 @@
-PYTHON = python3
-PIP = pip3
-PYTHONIOENCODING=utf8
-PYTEST_ARGS = -vv
-DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
-DOCKER_TAG = ocrd/dinglehopper
-
-help:
-	@echo
-	@echo "  Targets"
-	@echo
-	@echo "  install      Install full Python package via pip"
-	@echo "  docker       Build the ocrd/dinglehopper docker image"
-
-# Install Python package via pip
-install:
-	$(PIP) install .
-
-install-dev:
-	$(PIP) install -e .
-
-test:
-	pytest $(PYTEST_ARGS)
-
-docker:
-	docker build \
-	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
-	--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
-	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
-	-t $(DOCKER_TAG) .
-
-.PHONY: help install install-dev test docker

@@ -7,10 +7,9 @@ authors = [
     {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
     {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
 ]
-description = "An OCR evaluation tool"
+description = "The OCR evaluation tool"
 readme = "README.md"
-license.file = "LICENSE"
-requires-python = ">=3.9"
+requires-python = ">=3.8"
 keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
 dynamic = ["version", "dependencies", "optional-dependencies"]
@@ -49,7 +48,7 @@ optional-dependencies.dev = {file = ["requirements-dev.txt"]}
 where = ["src"]

 [tool.setuptools.package-data]
-dinglehopper = ["templates/*", "*.json"]
+dinglehopper = ["templates/*"]

 [tool.pytest.ini_options]
@@ -75,40 +74,5 @@ disallow_untyped_defs = false
 disallow_untyped_calls = false

-[tool.ruff.lint]
+[tool.ruff]
 select = ["E", "F", "I"]
-
-[tool.liccheck]
-authorized_licenses = [
-    "bsd",
-    "new bsd",
-    "bsd license",
-    "new bsd license",
-    "simplified bsd",
-    "apache",
-    "apache 2.0",
-    "apache software license",
-    "apache software",
-    "apache license 2.0",
-    "gnu lgpl",
-    "lgpl with exceptions or zpl",
-    "GNU Library or Lesser General Public License (LGPL)",
-    "GNU Lesser General Public License v3 (LGPLv3)",
-    "GNU Lesser General Public License v2 or later (LGPLv2+)",
-    "mit",
-    "mit license",
-    "mit-cmu",
-    "python software foundation",
-    "psf",
-    "psf-2.0",
-    "Historical Permission Notice and Disclaimer (HPND)",
-    "public domain",
-    'The Unlicense (Unlicense)',
-    "isc",
-    "ISC License (ISCL)",
-    'Mozilla Public License 2.0 (MPL 2.0)',
-]
-unauthorized_licenses = [
-    "gpl v3",
-]

@@ -10,5 +10,3 @@ mypy
 types-lxml
 types-setuptools
 pytest-mypy
-liccheck

@@ -1,14 +1,13 @@
 click
 jinja2
 lxml
-uniseg >= 0.9.1
+uniseg >= 0.8.0
 numpy
 colorama
 MarkupSafe
-ocrd >= 3.3.0
+ocrd >= 2.65.0
 attrs
 multimethod >= 1.3
 tqdm
 rapidfuzz >= 2.7.0
 chardet
-importlib_resources

@@ -234,7 +234,6 @@ def process_dir(
     metavar="LEVEL",
 )
 @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
-@click.version_option()
 def main(
     gt,
     ocr,
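
Note: @click.version_option(), present only on the master side above, is standard click API: with no arguments it adds a --version flag that reports the installed package's version from its metadata. A minimal runnable sketch (the command body and the explicit version string are hypothetical; dinglehopper itself relies on the installed package metadata):

    import click

    @click.command()
    @click.version_option(version="1.2.3")  # explicit version so this standalone sketch runs
    @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
    def main(progress):
        """Hypothetical stand-in for the dinglehopper CLI entry point."""
        click.echo(f"progress={progress}")

    if __name__ == "__main__":
        main()  # `python sketch.py --version` prints "main, version 1.2.3"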

@@ -149,7 +149,7 @@ class ExtractedText:
             raise ValueError("Can't have joiner without segments to join")
         if self.segments is not None:
             if value not in ("", " ", "\n"):
-                raise ValueError(f"Unexpected segment joiner value {repr(value)}")
+                raise ValueError(f"Unexcepted segment joiner value {repr(value)}")

     @_text.validator
     def is_valid_text(self, _, value):

@@ -36,7 +36,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]:
     for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
         line_id = line.attrib.get("ID")
         line_text = " ".join(
-            string.attrib.get("CONTENT", "")
+            string.attrib.get("CONTENT")
             for string in line.iterfind("alto:String", namespaces=nsmap)
         )
         normalized_text = normalize_sbb(line_text)
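
Note: the master side passes a default to attrib.get() here. A sketch of why, assuming an ALTO String element may legitimately lack a CONTENT attribute (plain dicts stand in for lxml attribute maps):

    # Plain dicts stand in for lxml's element.attrib mappings.
    strings = [{"CONTENT": "Hello"}, {}]  # second String lacks CONTENT

    # Without a default, .get("CONTENT") yields None for the second element
    # and str.join raises TypeError; the "" default keeps the join total.
    line_text = " ".join(s.get("CONTENT", "") for s in strings)
    assert line_text == "Hello "
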
@@ -167,7 +167,7 @@ def plain_extract(filename, include_filename_in_id=False):
     with open(filename, "r", encoding=fileencoding) as f:
         return ExtractedText(
             None,
-            [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())],
+            [make_segment(no, line) for no, line in enumerate(f.readlines())],
             "\n",
             None,
             None,
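
Note: the master side strips each line before building a segment. A sketch of the difference, given that the segments are rejoined with the "\n" joiner shown above:

    lines = ["First, a line.\n", "And a second line.\n"]  # as from f.readlines()

    # v0.9.6: each segment keeps its trailing newline, so the "\n" joiner
    # doubles every line break.
    assert "\n".join(lines) == "First, a line.\n\nAnd a second line.\n"

    # master: stripping first yields single line breaks.
    assert "\n".join(l.strip() for l in lines) == "First, a line.\nAnd a second line."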

@@ -1,13 +1,17 @@
 {
-  "version": "0.10.0",
+  "version": "0.9.6",
   "git_url": "https://github.com/qurator-spk/dinglehopper",
-  "dockerhub": "ocrd/dinglehopper",
   "tools": {
     "ocrd-dinglehopper": {
       "executable": "ocrd-dinglehopper",
-      "input_file_grp_cardinality": 2,
-      "output_file_grp_cardinality": 1,
       "description": "Evaluate OCR text against ground truth with dinglehopper",
+      "input_file_grp": [
+        "OCR-D-GT-PAGE",
+        "OCR-D-OCR"
+      ],
+      "output_file_grp": [
+        "OCR-D-OCR-EVAL"
+      ],
       "categories": [
         "Quality assurance"
       ],
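
Note: on the master side the fixed input/output file group lists give way to cardinality fields; OCR-D core (v3 semantics assumed here) then validates the number of -I/-O groups rather than their names. A minimal sketch of that check in plain Python:

    import json

    # Trimmed copy of the master-side tool description.
    tool = json.loads("""
    {
      "executable": "ocrd-dinglehopper",
      "input_file_grp_cardinality": 2,
      "output_file_grp_cardinality": 1
    }
    """)

    # e.g. ocrd-dinglehopper -I OCR-D-GT-PAGE,OCR-D-OCR -O OCR-D-OCR-EVAL
    input_grps = "OCR-D-GT-PAGE,OCR-D-OCR".split(",")
    output_grps = "OCR-D-OCR-EVAL".split(",")
    assert len(input_grps) == tool["input_file_grp_cardinality"]    # GT and OCR
    assert len(output_grps) == tool["output_file_grp_cardinality"]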

@@ -1,76 +1,78 @@
-from functools import cached_property
+import json
 import os
-from typing import Optional

 import click
-from ocrd_models import OcrdFileType
 from ocrd import Processor
 from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
-from ocrd_utils import make_file_id
+from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
+from pkg_resources import resource_string

 from .cli import process as cli_process

+OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
+

 @click.command()
 @ocrd_cli_options
 def ocrd_dinglehopper(*args, **kwargs):
     return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)


 class OcrdDinglehopperEvaluate(Processor):
+    def __init__(self, *args, **kwargs):
+        kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
+        super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)

-    @cached_property
-    def executable(self):
-        return 'ocrd-dinglehopper'
+    def process(self):
+        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
+        assert_file_grp_cardinality(self.output_file_grp, 1)

-    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
-        assert self.parameter
+        log = getLogger("processor.OcrdDinglehopperEvaluate")
+
         metrics = self.parameter["metrics"]
         textequiv_level = self.parameter["textequiv_level"]
+        gt_grp, ocr_grp = self.input_file_grp.split(",")

-        # wrong number of inputs: let fail
-        gt_file, ocr_file = input_files
-        # missing on either side: skip (zip_input_files already warned)
-        if not gt_file or not ocr_file:
-            return
-        # missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
-        if not gt_file.local_filename:
-            if config.OCRD_MISSING_INPUT == 'ABORT':
-                raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype)
-            return
-        if not ocr_file.local_filename:
-            if config.OCRD_MISSING_INPUT == 'ABORT':
-                raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype)
-            return
+        input_file_tuples = self.zip_input_files(on_error="abort")
+        for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
+            if not gt_file or not ocr_file:
+                # file/page was not found in this group
+                continue
+            gt_file = self.workspace.download_file(gt_file)
+            ocr_file = self.workspace.download_file(ocr_file)
+            page_id = gt_file.pageId

-        page_id = gt_file.pageId
-        file_id = make_file_id(ocr_file, self.output_file_grp)
-        cli_process(
-            gt_file.local_filename,
-            ocr_file.local_filename,
-            file_id,
-            self.output_file_grp,
-            metrics=metrics,
-            textequiv_level=textequiv_level,
-        )
+            log.info("INPUT FILES %i / %s%s", n, gt_file, ocr_file)

-        # Add reports to the workspace
-        for report_suffix, mimetype in [
-            [".html", "text/html"],
-            [".json", "application/json"],
-        ]:
-            output_file_id = file_id + report_suffix
-            output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
-            if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
-                raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set")
-            self.workspace.add_file(
-                file_id=output_file_id,
-                file_grp=self.output_file_grp,
-                page_id=page_id,
-                mimetype=mimetype,
-                local_filename=file_id + report_suffix,
-            )
+            file_id = make_file_id(ocr_file, self.output_file_grp)
+            report_prefix = os.path.join(self.output_file_grp, file_id)
+
+            # Process the files
+            try:
+                os.mkdir(self.output_file_grp)
+            except FileExistsError:
+                pass
+            cli_process(
+                gt_file.local_filename,
+                ocr_file.local_filename,
+                report_prefix,
+                metrics=metrics,
+                textequiv_level=textequiv_level,
+            )
+
+            # Add reports to the workspace
+            for report_suffix, mimetype in [
+                [".html", "text/html"],
+                [".json", "application/json"],
+            ]:
+                self.workspace.add_file(
+                    file_id=file_id + report_suffix,
+                    file_grp=self.output_file_grp,
+                    page_id=page_id,
+                    mimetype=mimetype,
+                    local_filename=report_prefix + report_suffix,
+                )


 if __name__ == "__main__":
     ocrd_dinglehopper()

@@ -177,8 +177,8 @@ def test_text():
 def test_plain(tmp_path):
     with working_directory(tmp_path):
         with open("ocr.txt", "w") as ocrf:
-            ocrf.write("First, a line.\nAnd a second line.\n")
+            ocrf.write("AAAAB")

         result = plain_text("ocr.txt")
-        expected = "First, a line.\nAnd a second line."
+        expected = "AAAAB"
         assert result == expected

@@ -22,11 +22,11 @@ def patch_word_break():
     """
     old_word_break = uniseg.wordbreak.word_break

-    def new_word_break(c):
+    def new_word_break(c, index=0):
         if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
-            return uniseg.wordbreak.Word_Break.ALetter
+            return uniseg.wordbreak.WordBreak.ALETTER
         else:
-            return old_word_break(c)
+            return old_word_break(c, index)

     uniseg.wordbreak.word_break = new_word_break
     global word_break_patched
