diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index a8312db..0000000 --- a/.dockerignore +++ /dev/null @@ -1,5 +0,0 @@ -src/dinglehopper/tests -dist -build -*.egg-info -.git diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3f51bd7..8c193df 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v3 - name: Upgrade pip run: python3 -m pip install --upgrade pip - name: Install setuptools @@ -32,7 +32,7 @@ jobs: - name: Build package run: python3 -m pip install --upgrade build && python3 -m build - name: Upload dist - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 with: name: dist path: dist/ @@ -42,7 +42,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download dist - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v3 with: name: dist path: dist/ @@ -61,7 +61,7 @@ jobs: id-token: write # IMPORTANT: this permission is mandatory for trusted publishing steps: - name: Download dist - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v3 with: name: dist path: dist/ diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index db089d0..f049c2c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,4 +1,4 @@ -name: 'Test' +name: Test on: @@ -25,19 +25,18 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] runs-on: "ubuntu-latest" steps: - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - allow-prereleases: true - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v3 - name: Install possible lxml build requirements (if building from source) run: sudo apt-get install -y libxml2-dev libxslt-dev python3-dev @@ -57,7 +56,7 @@ jobs: cd src python3 -m pytest --junitxml=../${{matrix.python-version}}-junit.xml -o junit_family=legacy - name: Upload test results - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 if: success() || failure() with: name: test-results-${{matrix.python-version}} diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml index 5579d8c..908a593 100644 --- a/.github/workflows/test_report.yml +++ b/.github/workflows/test_report.yml @@ -1,4 +1,4 @@ -name: 'Test - Report results' +name: 'Test Report' on: workflow_run: workflows: ['test'] @@ -12,9 +12,9 @@ jobs: report: runs-on: ubuntu-latest steps: - - uses: dorny/test-reporter@v1 + - uses: dorny/test-reporter@v1.7.0 with: artifact: /test-results-(.*)/ - name: 'test - Results ($1)' + name: 'Tests Results - $1' path: '*junit.xml' reporter: java-junit diff --git a/.gitignore b/.gitignore index 66d66bc..d931831 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,6 @@ dmypy.json # User-specific stuff .idea -.*.swp # Build artifacts /build diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 345060d..640db3b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v4.6.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -11,12 +11,12 @@ repos: - id: check-ast - repo: https://github.com/psf/black - rev: 25.1.0 + rev: 24.4.2 hooks: - id: black - repo: 
https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.7 + rev: v0.4.3 hooks: - args: - --fix @@ -24,7 +24,7 @@ repos: id: ruff - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.15.0 + rev: v1.10.0 hooks: - additional_dependencies: - types-setuptools @@ -36,12 +36,6 @@ repos: id: mypy - repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update - rev: v0.6.1 + rev: v0.3.1post2 hooks: - id: pre-commit-update - -- repo: https://github.com/dhatim/python-license-check - rev: 0.9.2 - hooks: - - id: liccheck - language: system diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 7064efc..0000000 --- a/Dockerfile +++ /dev/null @@ -1,40 +0,0 @@ -ARG DOCKER_BASE_IMAGE -FROM $DOCKER_BASE_IMAGE -ARG VCS_REF -ARG BUILD_DATE -LABEL \ - maintainer="https://github.com/qurator-spk/dinglehopper/issues" \ - org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ - org.label-schema.build-date=$BUILD_DATE \ - org.opencontainers.image.vendor="Staatsbibliothek zu Berlin — SPK" \ - org.opencontainers.image.title="dinglehopper" \ - org.opencontainers.image.description="An OCR evaluation tool" \ - org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \ - org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \ - org.opencontainers.image.revision=$VCS_REF \ - org.opencontainers.image.created=$BUILD_DATE \ - org.opencontainers.image.base.name=ocrd/core - -ENV LANG=C.UTF-8 -ENV LC_ALL=C.UTF-8 - -# avoid HOME/.local/share (hard to predict USER here) -# so let XDG_DATA_HOME coincide with fixed system location -# (can still be overridden by derived stages) -ENV XDG_DATA_HOME /usr/local/share -# avoid the need for an extra volume for persistent resource user db -# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml) -ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources - -WORKDIR /build/dinglehopper -COPY . . -COPY ocrd-tool.json . -# prepackage ocrd-tool.json as ocrd-all-tool.json -RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json -# prepackage ocrd-all-module-dir.json -RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json -RUN make install && rm -rf /build/dinglehopper - -WORKDIR /data -VOLUME /data diff --git a/LICENSE b/LICENSE index 221c706..9b7a833 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2019-2025 Staatsbibliothek zu Berlin — SPK + Copyright 2019 qurator Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/Makefile b/Makefile deleted file mode 100644 index 3729311..0000000 --- a/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -PYTHON = python3 -PIP = pip3 -PYTHONIOENCODING=utf8 -PYTEST_ARGS = -vv - -DOCKER_BASE_IMAGE ?= docker.io/ocrd/core:latest -DOCKER_TAG ?= ocrd/dinglehopper -DOCKER ?= docker - -help: - @echo - @echo " Targets" - @echo - @echo " install Install full Python package via pip" - @echo " docker Build the ocrd/dinglehopper docker image" - -# Install Python package via pip -install: - $(PIP) install . - -install-dev: - $(PIP) install -e . 
- -test: - pytest $(PYTEST_ARGS) - -docker: - $(DOCKER) build \ - --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ - --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ - --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ - -t $(DOCKER_TAG) . - -.PHONY: help install install-dev test docker diff --git a/README.md b/README.md index a40db79..76fcc5a 100644 --- a/README.md +++ b/README.md @@ -112,13 +112,9 @@ You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt. with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate CLI interface: -``` +~~~ dinglehopper-line-dirs gt/ ocr/ -``` - -The CLI `dinglehopper-line-dirs` can also work with GT text files in the same -directories as the the OCR text files. You should read `dinglehopper-line-dirs --help` -in this case. +~~~ ### dinglehopper-extract The tool `dinglehopper-extract` extracts the text of the given input file on diff --git a/pyproject.toml b/pyproject.toml index 62fae82..41d45ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,9 +7,8 @@ authors = [ {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"}, {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"}, ] -description = "An OCR evaluation tool" +description = "The OCR evaluation tool" readme = "README.md" -license.file = "LICENSE" requires-python = ">=3.8" keywords = ["qurator", "ocr", "evaluation", "ocr-d"] @@ -49,7 +48,7 @@ optional-dependencies.dev = {file = ["requirements-dev.txt"]} where = ["src"] [tool.setuptools.package-data] -dinglehopper = ["templates/*", "*.json"] +dinglehopper = ["templates/*"] [tool.pytest.ini_options] @@ -75,40 +74,5 @@ disallow_untyped_defs = false disallow_untyped_calls = false -[tool.ruff.lint] +[tool.ruff] select = ["E", "F", "I"] - - -[tool.liccheck] -authorized_licenses = [ - "bsd", - "new bsd", - "bsd license", - "new bsd license", - "simplified bsd", - "apache", - "apache 2.0", - "apache software license", - "apache software", - "apache license 2.0", - "gnu lgpl", - "lgpl with exceptions or zpl", - "GNU Library or Lesser General Public License (LGPL)", - "GNU Lesser General Public License v3 (LGPLv3)", - "GNU Lesser General Public License v2 or later (LGPLv2+)", - "mit", - "mit license", - "mit-cmu", - "python software foundation", - "psf", - "psf-2.0", - "Historical Permission Notice and Disclaimer (HPND)", - "public domain", - 'The Unlicense (Unlicense)', - "isc", - "ISC License (ISCL)", - 'Mozilla Public License 2.0 (MPL 2.0)', -] -unauthorized_licenses = [ - "gpl v3", -] diff --git a/requirements-dev.txt b/requirements-dev.txt index f9f748a..16ae880 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,5 +10,3 @@ mypy types-lxml types-setuptools pytest-mypy - -liccheck diff --git a/requirements.txt b/requirements.txt index 653ec59..846d389 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,10 +5,9 @@ uniseg >= 0.8.0 numpy colorama MarkupSafe -ocrd >= 3.3.0 +ocrd >= 2.65.0 attrs multimethod >= 1.3 tqdm rapidfuzz >= 2.7.0 chardet -importlib_resources diff --git a/src/dinglehopper/cli.py b/src/dinglehopper/cli.py index 2d3c075..78ac33c 100644 --- a/src/dinglehopper/cli.py +++ b/src/dinglehopper/cli.py @@ -114,7 +114,6 @@ def process( metrics: bool = True, differences: bool = False, textequiv_level: str = "region", - plain_encoding: str = "autodetect", ) -> None: """Check OCR result against GT. @@ -122,12 +121,8 @@ def process( this undecorated version and use Click on a wrapper. 
""" - gt_text = extract( - gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding - ) - ocr_text = extract( - ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding - ) + gt_text = extract(gt, textequiv_level=textequiv_level) + ocr_text = extract(ocr, textequiv_level=textequiv_level) gt_words: List[str] = list(words_normalized(gt_text)) ocr_words: List[str] = list(words_normalized(ocr_text)) @@ -200,7 +195,6 @@ def process_dir( metrics: bool = True, differences: bool = False, textequiv_level: str = "region", - plain_encoding: str = "autodetect", ) -> None: for gt_file in os.listdir(gt): gt_file_path = os.path.join(gt, gt_file) @@ -215,7 +209,6 @@ def process_dir( metrics=metrics, differences=differences, textequiv_level=textequiv_level, - plain_encoding=plain_encoding, ) else: print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path)) @@ -240,13 +233,7 @@ def process_dir( help="PAGE TextEquiv level to extract text from", metavar="LEVEL", ) -@click.option( - "--plain-encoding", - default="autodetect", - help='Encoding (e.g. "utf-8") of plain text files', -) @click.option("--progress", default=False, is_flag=True, help="Show progress bar") -@click.version_option() def main( gt, ocr, @@ -255,7 +242,6 @@ def main( metrics, differences, textequiv_level, - plain_encoding, progress, ): """ @@ -293,7 +279,6 @@ def main( metrics=metrics, differences=differences, textequiv_level=textequiv_level, - plain_encoding=plain_encoding, ) else: process( @@ -304,7 +289,6 @@ def main( metrics=metrics, differences=differences, textequiv_level=textequiv_level, - plain_encoding=plain_encoding, ) diff --git a/src/dinglehopper/cli_extract.py b/src/dinglehopper/cli_extract.py index 5fce032..9c51d34 100644 --- a/src/dinglehopper/cli_extract.py +++ b/src/dinglehopper/cli_extract.py @@ -12,12 +12,7 @@ from .ocr_files import extract help="PAGE TextEquiv level to extract text from", metavar="LEVEL", ) -@click.option( - "--plain-encoding", - default="autodetect", - help='Encoding (e.g. "utf-8") of plain text files', -) -def main(input_file, textequiv_level, plain_encoding): +def main(input_file, textequiv_level): """ Extract the text of the given INPUT_FILE. @@ -28,9 +23,7 @@ def main(input_file, textequiv_level, plain_encoding): use "--textequiv-level line" to extract from the level of TextLine tags. """ initLogging() - input_text = extract( - input_file, textequiv_level=textequiv_level, plain_encoding=plain_encoding - ).text + input_text = extract(input_file, textequiv_level=textequiv_level).text print(input_text) diff --git a/src/dinglehopper/cli_line_dirs.py b/src/dinglehopper/cli_line_dirs.py index 0160f87..03bf374 100644 --- a/src/dinglehopper/cli_line_dirs.py +++ b/src/dinglehopper/cli_line_dirs.py @@ -1,6 +1,5 @@ import itertools import os -from typing import Callable, Iterator, List, Optional, Tuple import click from jinja2 import Environment, FileSystemLoader @@ -13,41 +12,6 @@ from .ocr_files import plain_extract from .word_error_rate import word_error_rate_n, words_normalized -def removesuffix(text, suffix): - """ - Remove suffix from text. - - Can be replaced with str.removesuffix when we only support Python >= 3.9. 
- """ - if suffix and text.endswith(suffix): - return text[: -len(suffix)] - return text - - -def is_hidden(filepath): - filename = os.path.basename(os.path.abspath(filepath)) - return filename.startswith(".") - - -def find_all_files( - dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False -) -> Iterator[str]: - """ - Find all files in dir_, returning filenames - - If pred is given, pred(filename) must be True for the filename. - - Does not return hidden files by default. - """ - for root, _, filenames in os.walk(dir_): - for fn in filenames: - if not return_hidden and is_hidden(fn): - continue - if pred and not pred(fn): - continue - yield os.path.join(root, fn) - - def all_equal(iterable): g = itertools.groupby(iterable) return next(g, True) and not next(g, False) @@ -61,63 +25,15 @@ def common_suffix(its): return reversed(common_prefix(reversed(it) for it in its)) -def find_gt_and_ocr_files( - gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str -) -> Iterator[Tuple[str, str]]: - """ - Find GT files and matching OCR files. - - Returns pairs of GT and OCR files. - """ - for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)): - ocr_fn = os.path.join( - ocr_dir, - removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix, - ) - if not os.path.exists(ocr_fn): - raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist") - - yield gt_fn, ocr_fn +def removesuffix(text, suffix): + if suffix and text.endswith(suffix): + return text[: -len(suffix)] + return text -def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): - """ - Find GT files and matching OCR files, autodetect suffixes. - - This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR) - files with a common suffix. Currently the files must have a suffix, e.g. - ".gt.txt" (e.g. ".ocr.txt"). - - Returns pairs of GT and OCR files. 
- """ - - # Autodetect suffixes - gt_files = find_all_files(gt_dir) - gt_suffix = "".join(common_suffix(gt_files)) - if len(gt_suffix) == 0: - raise RuntimeError( - f"Files in GT directory {gt_dir} do not have a common suffix" - ) - ocr_files = find_all_files(ocr_dir) - ocr_suffix = "".join(common_suffix(ocr_files)) - if len(ocr_suffix) == 0: - raise RuntimeError( - f"Files in OCR directory {ocr_dir} do not have a common suffix" - ) - - yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) - - -def process( - gt_dir, - ocr_dir, - report_prefix, - *, - metrics=True, - gt_suffix=None, - ocr_suffix=None, - plain_encoding="autodetect", -): +def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): + gt_suffix = "".join(common_suffix(os.listdir(gt_dir))) + ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir))) cer = None n_characters = None @@ -126,20 +42,16 @@ def process( n_words = None word_diff_report = "" - if gt_suffix is not None and ocr_suffix is not None: - gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) - else: - gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir) + for k, gt in enumerate(os.listdir(gt_dir)): + # Find a match by replacing the suffix + ocr = removesuffix(gt, gt_suffix) + ocr_suffix - for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files): - gt_text = plain_extract( - gt_fn, include_filename_in_id=True, encoding=plain_encoding - ) + gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True) ocr_text = plain_extract( - ocr_fn, include_filename_in_id=True, encoding=plain_encoding + os.path.join(ocr_dir, ocr), include_filename_in_id=True ) - gt_words: List[str] = list(words_normalized(gt_text)) - ocr_words: List[str] = list(words_normalized(ocr_text)) + gt_words = words_normalized(gt_text) + ocr_words = words_normalized(ocr_text) # Compute CER l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text) @@ -169,7 +81,7 @@ def process( joiner="", none="·", score_hint=score_hint(l_cer, l_n_characters), - )[0] + ) word_diff_report += gen_diff_report( gt_words, ocr_words, @@ -177,7 +89,7 @@ def process( joiner=" ", none="⋯", score_hint=score_hint(l_wer, l_n_words), - )[0] + ) env = Environment( loader=FileSystemLoader( @@ -211,30 +123,17 @@ def process( @click.option( "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red" ) -@click.option("--gt-suffix", help="Suffix of GT line text files") -@click.option("--ocr-suffix", help="Suffix of OCR line text files") -@click.option( - "--plain-encoding", - default="autodetect", - help='Encoding (e.g. "utf-8") of plain text files', -) -def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding): +def main(gt, ocr, report_prefix, metrics): """ Compare the GT line text directory against the OCR line text directory. This assumes that the GT line text directory contains textfiles with a common suffix like ".gt.txt", and the OCR line text directory contains textfiles with a common suffix like ".some-ocr.txt". The text files also need to be paired, - i.e. the GT filename "line001.gt.txt" needs to match a filename - "line001.some-ocr.txt" in the OCR lines directory. + i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt" + in the OCT lines directory. - GT and OCR directories may contain line text files in matching subdirectories, - e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt". 
- - GT and OCR directories can also be the same directory, but in this case you need - to give --gt-suffix and --ocr-suffix explicitly. - - The GT and OCR directories are usually ground truth line texts and the results of + The GT and OCR directories are usually round truth line texts and the results of an OCR software, but you may use dinglehopper to compare two OCR results. In that case, use --no-metrics to disable the then meaningless metrics and also change the color scheme from green/red to blue. @@ -243,19 +142,9 @@ def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding) $REPORT_PREFIX defaults to "report". The reports include the character error rate (CER) and the word error rate (WER). - It is recommended to specify the encoding of the text files, for example with - --plain-encoding utf-8. If this option is not given, we try to auto-detect it. """ initLogging() - process( - gt, - ocr, - report_prefix, - metrics=metrics, - gt_suffix=gt_suffix, - ocr_suffix=ocr_suffix, - plain_encoding=plain_encoding, - ) + process(gt, ocr, report_prefix, metrics=metrics) if __name__ == "__main__": diff --git a/src/dinglehopper/extracted_text.py b/src/dinglehopper/extracted_text.py index acfbf78..6dcf0a7 100644 --- a/src/dinglehopper/extracted_text.py +++ b/src/dinglehopper/extracted_text.py @@ -149,7 +149,7 @@ class ExtractedText: raise ValueError("Can't have joiner without segments to join") if self.segments is not None: if value not in ("", " ", "\n"): - raise ValueError(f"Unexpected segment joiner value {repr(value)}") + raise ValueError(f"Unexcepted segment joiner value {repr(value)}") @_text.validator def is_valid_text(self, _, value): diff --git a/src/dinglehopper/ocr_files.py b/src/dinglehopper/ocr_files.py index fdcaf54..0c4fa04 100644 --- a/src/dinglehopper/ocr_files.py +++ b/src/dinglehopper/ocr_files.py @@ -5,13 +5,10 @@ from typing import Dict, Iterator, Optional import chardet from lxml import etree as ET from lxml.etree import XMLSyntaxError -from ocrd_utils import getLogger from uniseg.graphemecluster import grapheme_clusters from .extracted_text import ExtractedText, normalize_sbb -log = getLogger("processor.OcrdDinglehopperEvaluate") - def alto_namespace(tree: ET._ElementTree) -> Optional[str]: """Return the ALTO namespace used in the given ElementTree. 
@@ -39,7 +36,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]: for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap): line_id = line.attrib.get("ID") line_text = " ".join( - string.attrib.get("CONTENT", "") + string.attrib.get("CONTENT") for string in line.iterfind("alto:String", namespaces=nsmap) ) normalized_text = normalize_sbb(line_text) @@ -152,7 +149,7 @@ def detect_encoding(filename): return chardet.detect(open(filename, "rb").read(1024))["encoding"] -def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"): +def plain_extract(filename, include_filename_in_id=False): id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}" def make_segment(no, line): @@ -166,18 +163,11 @@ def plain_extract(filename, include_filename_in_id=False, encoding="autodetect") clusters, ) - if encoding == "autodetect": - fileencoding = detect_encoding(filename) - log.warning( - f"Autodetected encoding as '{fileencoding}'" - ", it is recommended to specify it explicitly with --plain-encoding" - ) - else: - fileencoding = encoding + fileencoding = detect_encoding(filename) with open(filename, "r", encoding=fileencoding) as f: return ExtractedText( None, - [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())], + [make_segment(no, line) for no, line in enumerate(f.readlines())], "\n", None, None, @@ -185,11 +175,11 @@ def plain_extract(filename, include_filename_in_id=False, encoding="autodetect") # XXX hardcoded SBB normalization -def plain_text(filename, encoding="autodetect"): - return plain_extract(filename, encoding=encoding).text +def plain_text(filename): + return plain_extract(filename).text -def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"): +def extract(filename, *, textequiv_level="region"): """Extract the text from the given file. Supports PAGE, ALTO and falls back to plain text. @@ -197,7 +187,7 @@ def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"): try: tree = ET.parse(filename) except (XMLSyntaxError, UnicodeDecodeError): - return plain_extract(filename, encoding=plain_encoding) + return plain_extract(filename) try: return page_extract(tree, textequiv_level=textequiv_level) except ValueError: diff --git a/src/dinglehopper/ocrd-tool.json b/src/dinglehopper/ocrd-tool.json index ad48e51..27ee989 100644 --- a/src/dinglehopper/ocrd-tool.json +++ b/src/dinglehopper/ocrd-tool.json @@ -1,13 +1,17 @@ { - "version": "0.11.0", + "version": "0.9.6", "git_url": "https://github.com/qurator-spk/dinglehopper", - "dockerhub": "ocrd/dinglehopper", "tools": { "ocrd-dinglehopper": { "executable": "ocrd-dinglehopper", - "input_file_grp_cardinality": 2, - "output_file_grp_cardinality": 1, "description": "Evaluate OCR text against ground truth with dinglehopper", + "input_file_grp": [ + "OCR-D-GT-PAGE", + "OCR-D-OCR" + ], + "output_file_grp": [ + "OCR-D-OCR-EVAL" + ], "categories": [ "Quality assurance" ], @@ -25,11 +29,6 @@ "enum": ["region", "line"], "default": "region", "description": "PAGE XML hierarchy level to extract the text from" - }, - "plain_encoding": { - "type": "string", - "default": "autodetect", - "description": "Encoding (e.g. 
\"utf-8\") of plain text files" } } } diff --git a/src/dinglehopper/ocrd_cli.py b/src/dinglehopper/ocrd_cli.py index 2d7da8e..8eebdc0 100644 --- a/src/dinglehopper/ocrd_cli.py +++ b/src/dinglehopper/ocrd_cli.py @@ -1,78 +1,78 @@ -from functools import cached_property +import json import os -from typing import Optional import click -from ocrd_models import OcrdFileType from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_utils import make_file_id +from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id +from pkg_resources import resource_string from .cli import process as cli_process +OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8")) + + @click.command() @ocrd_cli_options def ocrd_dinglehopper(*args, **kwargs): return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) + class OcrdDinglehopperEvaluate(Processor): + def __init__(self, *args, **kwargs): + kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"] + super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) - @cached_property - def executable(self): - return 'ocrd-dinglehopper' + def process(self): + assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR") + assert_file_grp_cardinality(self.output_file_grp, 1) - def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: + log = getLogger("processor.OcrdDinglehopperEvaluate") - assert self.parameter metrics = self.parameter["metrics"] textequiv_level = self.parameter["textequiv_level"] - plain_encoding = self.parameter["plain_encoding"] + gt_grp, ocr_grp = self.input_file_grp.split(",") - # wrong number of inputs: let fail - gt_file, ocr_file = input_files - # missing on either side: skip (zip_input_files already warned) - if not gt_file or not ocr_file: - return - # missing download (i.e. 
OCRD_DOWNLOAD_INPUT=false): - if not gt_file.local_filename: - if config.OCRD_MISSING_INPUT == 'ABORT': - raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype) - return - if not ocr_file.local_filename: - if config.OCRD_MISSING_INPUT == 'ABORT': - raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype) - return + input_file_tuples = self.zip_input_files(on_error="abort") + for n, (gt_file, ocr_file) in enumerate(input_file_tuples): + if not gt_file or not ocr_file: + # file/page was not found in this group + continue + gt_file = self.workspace.download_file(gt_file) + ocr_file = self.workspace.download_file(ocr_file) + page_id = gt_file.pageId - page_id = gt_file.pageId + log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file) - file_id = make_file_id(ocr_file, self.output_file_grp) - cli_process( - gt_file.local_filename, - ocr_file.local_filename, - file_id, - self.output_file_grp, - metrics=metrics, - textequiv_level=textequiv_level, - plain_encoding=plain_encoding, - ) + file_id = make_file_id(ocr_file, self.output_file_grp) + report_prefix = os.path.join(self.output_file_grp, file_id) - # Add reports to the workspace - for report_suffix, mimetype in [ - [".html", "text/html"], - [".json", "application/json"], - ]: - output_file_id = file_id + report_suffix - output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) - if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': - raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set") - self.workspace.add_file( - file_id=output_file_id, - file_grp=self.output_file_grp, - page_id=page_id, - mimetype=mimetype, - local_filename=file_id + report_suffix, + # Process the files + try: + os.mkdir(self.output_file_grp) + except FileExistsError: + pass + cli_process( + gt_file.local_filename, + ocr_file.local_filename, + report_prefix, + metrics=metrics, + textequiv_level=textequiv_level, ) + # Add reports to the workspace + for report_suffix, mimetype in [ + [".html", "text/html"], + [".json", "application/json"], + ]: + self.workspace.add_file( + file_id=file_id + report_suffix, + file_grp=self.output_file_grp, + page_id=page_id, + mimetype=mimetype, + local_filename=report_prefix + report_suffix, + ) + if __name__ == "__main__": ocrd_dinglehopper() diff --git a/src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt b/src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt deleted file mode 100644 index 484ba93..0000000 --- a/src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt +++ /dev/null @@ -1 +0,0 @@ -This is a test. diff --git a/src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt deleted file mode 100644 index fc9bd6a..0000000 --- a/src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt +++ /dev/null @@ -1 +0,0 @@ -Another test. diff --git a/src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt deleted file mode 100644 index 27cf4bf..0000000 --- a/src/dinglehopper/tests/data/line_dirs/basic/ocr/a.some-ocr.txt +++ /dev/null @@ -1 +0,0 @@ -Tis is a test. diff --git a/src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt deleted file mode 100644 index 0bc0e40..0000000 --- a/src/dinglehopper/tests/data/line_dirs/basic/ocr/b.some-ocr.txt +++ /dev/null @@ -1 +0,0 @@ -AnÖther test. 
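
Editor's note: earlier in this diff, `plain_extract()` in `src/dinglehopper/ocr_files.py` drops the explicit `--plain-encoding` handling and always guesses the encoding with chardet's `detect()` on the first bytes of the file. A minimal sketch of that guessing step, assuming chardet is installed; the 1024-byte sample size mirrors `detect_encoding()` in the hunk above, and the file name in the usage comment is purely illustrative:

```python
import chardet

def detect_encoding(filename, sample_size=1024):
    """Guess a text file's encoding from its first bytes (as detect_encoding() above does)."""
    with open(filename, "rb") as f:
        return chardet.detect(f.read(sample_size))["encoding"]

# Illustrative usage (hypothetical file name):
# encoding = detect_encoding("line0001.gt.txt")
# with open("line0001.gt.txt", "r", encoding=encoding) as f:
#     text = f.read()
```
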
diff --git a/src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg b/src/dinglehopper/tests/data/line_dirs/merged/a/a.dummy.jpg deleted file mode 100644 index e69de29..0000000 diff --git a/src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt b/src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt deleted file mode 100644 index 484ba93..0000000 --- a/src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt +++ /dev/null @@ -1 +0,0 @@ -This is a test. diff --git a/src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt deleted file mode 100644 index 27cf4bf..0000000 --- a/src/dinglehopper/tests/data/line_dirs/merged/a/a.some-ocr.txt +++ /dev/null @@ -1 +0,0 @@ -Tis is a test. diff --git a/src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg b/src/dinglehopper/tests/data/line_dirs/merged/b/b.dummy.jpg deleted file mode 100644 index e69de29..0000000 diff --git a/src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt deleted file mode 100644 index fc9bd6a..0000000 --- a/src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt +++ /dev/null @@ -1 +0,0 @@ -Another test. diff --git a/src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt deleted file mode 100644 index 0bc0e40..0000000 --- a/src/dinglehopper/tests/data/line_dirs/merged/b/b.some-ocr.txt +++ /dev/null @@ -1 +0,0 @@ -AnÖther test. diff --git a/src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt deleted file mode 100644 index 484ba93..0000000 --- a/src/dinglehopper/tests/data/line_dirs/subdirs/gt/a/a.gt.txt +++ /dev/null @@ -1 +0,0 @@ -This is a test. diff --git a/src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt deleted file mode 100644 index fc9bd6a..0000000 --- a/src/dinglehopper/tests/data/line_dirs/subdirs/gt/b/b.gt.txt +++ /dev/null @@ -1 +0,0 @@ -Another test. diff --git a/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt deleted file mode 100644 index 27cf4bf..0000000 --- a/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/a/a.some-ocr.txt +++ /dev/null @@ -1 +0,0 @@ -Tis is a test. diff --git a/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt b/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt deleted file mode 100644 index 0bc0e40..0000000 --- a/src/dinglehopper/tests/data/line_dirs/subdirs/ocr/b/b.some-ocr.txt +++ /dev/null @@ -1 +0,0 @@ -AnÖther test. 
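
Editor's note: the `cli_line_dirs.py` hunk earlier in this diff pairs GT and OCR files by detecting a common filename suffix in each directory and swapping it, which is also how the fixtures deleted above were matched (`a.gt.txt` ↔ `a.some-ocr.txt`). A minimal, self-contained sketch of that pairing logic under illustrative file names:

```python
import itertools

def common_suffix(names):
    """Longest suffix shared by all names, e.g. ".gt.txt" for ["a.gt.txt", "b.gt.txt"]."""
    reversed_names = [name[::-1] for name in names]
    common = itertools.takewhile(lambda chars: len(set(chars)) == 1, zip(*reversed_names))
    return "".join(chars[0] for chars in common)[::-1]

def removesuffix(text, suffix):
    """Drop suffix from text if present (str.removesuffix requires Python >= 3.9)."""
    return text[: -len(suffix)] if suffix and text.endswith(suffix) else text

gt_names = ["a.gt.txt", "b.gt.txt"]               # illustrative GT directory listing
ocr_names = ["a.some-ocr.txt", "b.some-ocr.txt"]  # illustrative OCR directory listing

gt_suffix = common_suffix(gt_names)    # ".gt.txt"
ocr_suffix = common_suffix(ocr_names)  # ".some-ocr.txt"
pairs = [(gt, removesuffix(gt, gt_suffix) + ocr_suffix) for gt in gt_names]
assert pairs == [("a.gt.txt", "a.some-ocr.txt"), ("b.gt.txt", "b.some-ocr.txt")]
```

This only works when each directory holds files with one common suffix, which is why the hunk's help text asks for paired filenames.
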
diff --git a/src/dinglehopper/tests/test_integ_cli_line_dirs.py b/src/dinglehopper/tests/test_integ_cli_line_dirs.py deleted file mode 100644 index 90cbabf..0000000 --- a/src/dinglehopper/tests/test_integ_cli_line_dirs.py +++ /dev/null @@ -1,61 +0,0 @@ -import json -import os.path -import re - -import pytest - -from ..cli_line_dirs import process -from .util import working_directory - -data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") - - -@pytest.mark.integration -def test_cli_line_dirs_basic(tmp_path): - """Test that the cli/process() produces a good report""" - - with working_directory(tmp_path): - gt_dir = os.path.join(data_dir, "line_dirs/basic/gt") - ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr") - process(gt_dir, ocr_dir, "report") - with open("report.json", "r") as jsonf: - print(jsonf.read()) - with open("report.json", "r") as jsonf: - j = json.load(jsonf) - assert j["cer"] == pytest.approx(0.1071429) - assert j["wer"] == pytest.approx(0.5) - - -@pytest.mark.integration -def test_cli_line_dirs_basic_report_diff(tmp_path): - """Test that the cli/process() produces a report wiff char+word diff""" - - with working_directory(tmp_path): - gt_dir = os.path.join(data_dir, "line_dirs/basic/gt") - ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr") - process(gt_dir, ocr_dir, "report") - - with open("report.html", "r") as htmlf: - html_report = htmlf.read() - - # Counting GT lines in the diff - assert len(re.findall(r"gt.*l\d+-cdiff", html_report)) == 2 - assert len(re.findall(r"gt.*l\d+-wdiff", html_report)) == 2 - - -@pytest.mark.integration -def test_cli_line_dirs_merged(tmp_path): - """Test that the cli/process() produces a good report""" - - with working_directory(tmp_path): - gt_dir = os.path.join(data_dir, "line_dirs/merged") - ocr_dir = os.path.join(data_dir, "line_dirs/merged") - process( - gt_dir, ocr_dir, "report", gt_suffix=".gt.txt", ocr_suffix=".some-ocr.txt" - ) - with open("report.json", "r") as jsonf: - print(jsonf.read()) - with open("report.json", "r") as jsonf: - j = json.load(jsonf) - assert j["cer"] == pytest.approx(0.1071429) - assert j["wer"] == pytest.approx(0.5) diff --git a/src/dinglehopper/tests/test_integ_cli_valid_report.py b/src/dinglehopper/tests/test_integ_cli_valid_json.py similarity index 64% rename from src/dinglehopper/tests/test_integ_cli_valid_report.py rename to src/dinglehopper/tests/test_integ_cli_valid_json.py index fed0d28..6cbfa0c 100644 --- a/src/dinglehopper/tests/test_integ_cli_valid_report.py +++ b/src/dinglehopper/tests/test_integ_cli_valid_json.py @@ -1,5 +1,4 @@ import json -import re import pytest @@ -41,25 +40,3 @@ def test_cli_json_cer_is_infinity(tmp_path): with open("report.json", "r") as jsonf: j = json.load(jsonf) assert j["cer"] == pytest.approx(float("inf")) - - -@pytest.mark.integration -def test_cli_html(tmp_path): - """Test that the cli/process() yields complete HTML report""" - - with working_directory(tmp_path): - with open("gt.txt", "w") as gtf: - gtf.write("AAAAA") - with open("ocr.txt", "w") as ocrf: - ocrf.write("AAAAB") - - process("gt.txt", "ocr.txt", "report") - - with open("report.html", "r") as htmlf: - html_report = htmlf.read() - print(html_report) - - assert re.search(r"CER: 0\.\d+", html_report) - assert re.search(r"WER: 1\.0", html_report) - assert len(re.findall("gt.*cdiff", html_report)) == 1 - assert len(re.findall("gt.*wdiff", html_report)) == 1 diff --git a/src/dinglehopper/tests/test_line_dirs.py b/src/dinglehopper/tests/test_line_dirs.py deleted file mode 100644 
index 03966e1..0000000 --- a/src/dinglehopper/tests/test_line_dirs.py +++ /dev/null @@ -1,71 +0,0 @@ -import os - -from ..cli_line_dirs import find_gt_and_ocr_files, find_gt_and_ocr_files_autodetect - -data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") - - -def test_basic(): - """Test the dumb method: User gives directories and suffixes.""" - pairs = list( - find_gt_and_ocr_files( - os.path.join(data_dir, "line_dirs/basic/gt"), - ".gt.txt", - os.path.join(data_dir, "line_dirs/basic/ocr"), - ".some-ocr.txt", - ) - ) - - assert len(pairs) == 2 - - -def test_basic_autodetect(): - """Test autodetect: User gives directories, suffixes are autodetected if possible""" - pairs = list( - find_gt_and_ocr_files_autodetect( - os.path.join(data_dir, "line_dirs/basic/gt"), - os.path.join(data_dir, "line_dirs/basic/ocr"), - ) - ) - - assert len(pairs) == 2 - - -def test_subdirs(): - """Test the dumb method: Should also work when subdirectories are involved.""" - pairs = list( - find_gt_and_ocr_files( - os.path.join(data_dir, "line_dirs/subdirs/gt"), - ".gt.txt", - os.path.join(data_dir, "line_dirs/subdirs/ocr"), - ".some-ocr.txt", - ) - ) - - assert len(pairs) == 2 - - -def test_subdirs_autodetect(): - """Test the autodetect method: Should also work when subdirectories are involved.""" - pairs = list( - find_gt_and_ocr_files_autodetect( - os.path.join(data_dir, "line_dirs/subdirs/gt"), - os.path.join(data_dir, "line_dirs/subdirs/ocr"), - ) - ) - - assert len(pairs) == 2 - - -def test_merged(): - """Test the dumb method: GT and OCR texts are in the same directories.""" - pairs = list( - find_gt_and_ocr_files( - os.path.join(data_dir, "line_dirs/merged"), - ".gt.txt", - os.path.join(data_dir, "line_dirs/merged"), - ".some-ocr.txt", - ) - ) - - assert len(pairs) == 2 diff --git a/src/dinglehopper/tests/test_ocr_files.py b/src/dinglehopper/tests/test_ocr_files.py index 0c2a500..4790c85 100644 --- a/src/dinglehopper/tests/test_ocr_files.py +++ b/src/dinglehopper/tests/test_ocr_files.py @@ -177,20 +177,8 @@ def test_text(): def test_plain(tmp_path): with working_directory(tmp_path): with open("ocr.txt", "w") as ocrf: - ocrf.write("First, a line.\nAnd a second line.\n") + ocrf.write("AAAAB") result = plain_text("ocr.txt") - expected = "First, a line.\nAnd a second line." - assert result == expected - - -def test_plain_BOM(tmp_path): - """Test that plain text files with BOM are read correctly.""" - BOM = "\ufeff" - with working_directory(tmp_path): - with open("ocr.txt", "w") as ocrf: - ocrf.write(BOM + "First, a line.\nAnd a second line.\n") - - result = plain_text("ocr.txt") - expected = "First, a line.\nAnd a second line." 
+ expected = "AAAAB" assert result == expected diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py index f2db504..578850f 100644 --- a/src/dinglehopper/word_error_rate.py +++ b/src/dinglehopper/word_error_rate.py @@ -21,17 +21,12 @@ def patch_word_break(): https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt """ old_word_break = uniseg.wordbreak.word_break - if hasattr(uniseg.wordbreak, 'Word_Break'): - aletter = uniseg.wordbreak.Word_Break.ALetter - else: - # uniseg<0.9 - aletter = uniseg.wordbreak.WordBreak.ALETTER - def new_word_break(c): + def new_word_break(c, index=0): if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area - return aletter + return uniseg.wordbreak.WordBreak.ALETTER else: - return old_word_break(c) + return old_word_break(c, index) uniseg.wordbreak.word_break = new_word_break global word_break_patched