Merge branch 'master' into performance

pull/103/head
Mike Gerber 12 months ago
commit 38fcbc8e1c

@ -1,23 +0,0 @@
version: 2.1
jobs:
test:
parameters:
python-version:
type: string
docker:
- image: cimg/python:<< parameters.python-version >>
steps:
- checkout
- run: pip3 install --upgrade pip
- run: pip3 install -r requirements.txt
- run: pip3 install pytest
- run: pytest
workflows:
all-tests:
jobs:
- test:
matrix:
parameters:
python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]

@ -15,7 +15,7 @@ indent_size = 2
[*.json] [*.json]
indent_size = 2 indent_size = 2
insert_final_newline = false insert_final_newline = true
# trailing spaces in markdown indicate word wrap # trailing spaces in markdown indicate word wrap
[*.md] [*.md]

@ -0,0 +1,14 @@
#!/bin/bash
# Verify that the version of the Python package matches the checked-out git tag.
# Exits 0 when "v<package version>" equals the output of `git describe --tags`,
# and non-zero otherwise.

# Abort immediately if determining either version fails, instead of comparing
# against an empty/partial value.
set -euo pipefail

# We call setuptools.setup() here as we may rely on setuptools to interpret
# a dynamic version field. (Reading pyproject.toml is not enough in that case.)
expected_git_tag="v$(python -c 'from setuptools import setup; setup()' --version)"
actual_git_tag="$(git describe --tags)"

if [[ "$expected_git_tag" == "$actual_git_tag" ]]; then
  echo "OK: Python package version $expected_git_tag matches git tag"
  exit 0
else
  # Report failures on stderr so they are not mistaken for regular output.
  echo "ERROR: Python package version $expected_git_tag does NOT match git tag $actual_git_tag" >&2
  exit 1
fi

@ -0,0 +1,69 @@
# Release workflow: on a version tag push, run the test workflow, build the
# package, then publish it as a GitHub release and to PyPI.
name: release

on:
  push:
    tags:
      - "v*.*.*"

env:
  PYPI_URL: https://pypi.org/p/dinglehopper

jobs:
  # Re-use the regular test workflow; nothing is built unless it passes.
  test:
    uses: ./.github/workflows/test.yml

  build:
    needs: test
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Upgrade pip
        run: python3 -m pip install --upgrade pip
      - name: Install setuptools
        run: |
          python3 -m pip install --upgrade setuptools
          # For OCR-D tools, we need setuptools-ocrd to get the version
          if [ -e ocrd-tool.json ]; then
            python3 -m pip install setuptools-ocrd
          fi
      - name: Check git tag vs package version
        run: .github/workflows/release-check-version-tag
      - name: Build package
        run: python3 -m pip install --upgrade build && python3 -m build
      - name: Upload dist
        uses: actions/upload-artifact@v3
        with:
          name: dist
          path: dist/

  github-release:
    needs: build
    runs-on: ubuntu-latest
    permissions:
      # Creating a release and uploading its assets needs write access to the
      # repository contents; the default GITHUB_TOKEN may be read-only.
      contents: write
    steps:
      - name: Download dist
        uses: actions/download-artifact@v3
        with:
          name: dist
          path: dist/
      - name: Create release on GitHub
        uses: softprops/action-gh-release@v1
        with:
          files: dist/*

  pypi-publish:
    needs: build
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: ${{ env.PYPI_URL }}
    permissions:
      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
    steps:
      - name: Download dist
        uses: actions/download-artifact@v3
        with:
          name: dist
          path: dist/
      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

@ -0,0 +1,76 @@
# Test workflow: runs the test suite on every supported Python version and
# publishes the JUnit results.
name: test

on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master
  schedule:
    - cron: "00 16 07 * *"  # = monthly
  # Allow manually running (from GitHub Web)
  workflow_dispatch:
  # Allow calling this workflow (e.g. from release workflow)
  workflow_call:

jobs:
  test:
    strategy:
      # Let the other Python versions finish even if one of them fails.
      fail-fast: false
      matrix:
        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
    # For Python 3.6, we need to fall back to Ubuntu 20.04
    runs-on: ${{ matrix.python-version == '3.6' && 'ubuntu-20.04' || 'ubuntu-latest' }}
    env:
      test_results_dir: test-results-${{ matrix.python-version }}
    steps:
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Checkout
        uses: actions/checkout@v3
      - name: Update pip
        run: python3 -m pip install -U pip
      - name: Avoid compiling OpenCV and NumPy on Python 3.6
        run: |
          if python3 --version | grep -q "Python 3.6"; then
            python3 -m pip install --prefer-binary -U opencv-python-headless numpy
          fi
      - name: Install requirements*.txt
        run: |
          for requirements_txt in requirements*.txt; do
            python3 -m pip install -r "$requirements_txt"
          done
      - name: Test
        run: |
          cd src
          mkdir -p "../$test_results_dir"
          python3 -m pytest --junitxml="../$test_results_dir/junit.xml" -o junit_family=legacy
      # The two reporting steps also run after a failed test step, so that
      # failing results are still uploaded and reported.
      - name: Upload test results
        uses: actions/upload-artifact@v3
        if: success() || failure()
        with:
          name: ${{ env.test_results_dir }}
          path: ${{ env.test_results_dir }}
      - name: Report tests
        uses: dorny/test-reporter@v1
        if: success() || failure()
        with:
          name: Results on Python ${{ matrix.python-version }}
          path: "${{ env.test_results_dir }}/junit.xml"
          reporter: java-junit

2
.gitignore vendored

@ -16,6 +16,7 @@ htmlcov/
.venv .venv
env/ env/
venv/ venv/
.python-version
# mypy # mypy
.mypy_cache/ .mypy_cache/
@ -27,3 +28,4 @@ dmypy.json
# Build artifacts # Build artifacts
/build /build
/dist

@ -0,0 +1,36 @@
# pre-commit hook configuration; install the hooks with `pre-commit install`.
repos:
  # Generic file hygiene and syntax checks
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-json
      - id: check-toml
      - id: check-yaml
      - id: check-added-large-files
      - id: check-ast

  # Code formatting
  - repo: https://github.com/psf/black
    rev: 23.10.0
    hooks:
      - id: black

  # Linting
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.1.1
    hooks:
      - id: ruff
        args:
          - --fix
          - --exit-non-zero-on-fix

  # Static type checking
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.6.1
    hooks:
      - id: mypy
        additional_dependencies:
          - types-setuptools

  # Keeps the hook revisions in this file up to date
  - repo: https://gitlab.com/vojko.pribudic/pre-commit-update
    rev: v0.1.0
    hooks:
      - id: pre-commit-update

@ -1,6 +1,6 @@
Testing Testing
======= =======
Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests): Use `pytest` to run the tests in [the tests directory](dinglehopper/tests):
```bash ```bash
virtualenv -p /usr/bin/python3 venv virtualenv -p /usr/bin/python3 venv
. venv/bin/activate . venv/bin/activate
@ -10,6 +10,7 @@ pytest
``` ```
## Test running examples ## Test running examples
Only unit tests: Only unit tests:
```bash ```bash
pytest -m "not integration" pytest -m "not integration"
@ -27,11 +28,18 @@ pytest
All tests with code coverage: All tests with code coverage:
```bash ```bash
pytest --cov=qurator --cov-report=html pytest --cov=dinglehopper --cov-report=html
``` ```
Static code analysis: Static code analysis:
```bash ```bash
pytest -k "not test" --flake8
pytest -k "not test" --mypy pytest -k "not test" --mypy
pytest -k "not test" --ruff
``` ```
# How to use pre-commit
This project optionally uses [pre-commit](https://pre-commit.com) to check commits. To use it:
- Install pre-commit, e.g. `pip install -r requirements-dev.txt`
- Install the repo-local git hooks: `pre-commit install`

@ -5,9 +5,13 @@ dinglehopper is an OCR evaluation tool and reads
[ALTO](https://github.com/altoxml), [ALTO](https://github.com/altoxml),
[PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. It [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. It
compares a ground truth (GT) document page with a OCR result page to compute compares a ground truth (GT) document page with a OCR result page to compute
metrics and a word/character differences report. metrics and a word/character differences report. It also supports batch processing by
generating, aggregating and summarizing multiple reports.
[![Build Status](https://circleci.com/gh/qurator-spk/dinglehopper.svg?style=svg)](https://circleci.com/gh/qurator-spk/dinglehopper) [![Tests](https://github.com/qurator-spk/dinglehopper/workflows/test/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
[![GitHub tag](https://img.shields.io/github/tag/qurator-spk/dinglehopper?include_prereleases=&sort=semver&color=blue)](https://github.com/qurator-spk/dinglehopper/releases/)
[![License](https://img.shields.io/badge/License-Apache-blue)](#license)
[![issues - dinglehopper](https://img.shields.io/github/issues/qurator-spk/dinglehopper)](https://github.com/qurator-spk/dinglehopper/issues)
Goals Goals
----- -----
@ -19,15 +23,16 @@ Goals
Installation Installation
------------ ------------
It's best to use pip, e.g.:
~~~ It's best to use pip to install the package from PyPI, e.g.:
sudo pip install . ```
~~~ pip install dinglehopper
```
Usage Usage
----- -----
~~~ ~~~
Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] [REPORTS_FOLDER]
Compare the PAGE/ALTO/text document GT against the document OCR. Compare the PAGE/ALTO/text document GT against the document OCR.
@ -35,19 +40,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
their text and falls back to plain text if no ALTO or PAGE is detected. their text and falls back to plain text if no ALTO or PAGE is detected.
The files GT and OCR are usually a ground truth document and the result of The files GT and OCR are usually a ground truth document and the result of
an OCR software, but you may use dinglehopper to compare two OCR results. an OCR software, but you may use dinglehopper to compare two OCR results. In
In that case, use --no-metrics to disable the then meaningless metrics and that case, use --no-metrics to disable the then meaningless metrics and also
also change the color scheme from green/red to blue. change the color scheme from green/red to blue.
The comparison report will be written to $REPORT_PREFIX.{html,json}, where The comparison report will be written to
$REPORT_PREFIX defaults to "report". The reports include the character $REPORTS_FOLDER/$REPORT_PREFIX.{html,json}, where $REPORTS_FOLDER defaults
error rate (CER) and the word error rate (WER). to the current working directory and $REPORT_PREFIX defaults to "report".
The reports include the character error rate (CER) and the word error rate
(WER).
By default, the text of PAGE files is extracted on 'region' level. You may By default, the text of PAGE files is extracted on 'region' level. You may
use "--textequiv-level line" to extract from the level of TextLine tags. use "--textequiv-level line" to extract from the level of TextLine tags.
Options: Options:
--metrics / --no-metrics Enable/disable metrics and green/red --metrics / --no-metrics Enable/disable metrics and green/red
--differences BOOLEAN Enable reporting character and word level
differences
--textequiv-level LEVEL PAGE TextEquiv level to extract text from --textequiv-level LEVEL PAGE TextEquiv level to extract text from
--progress Show progress bar --progress Show progress bar
--help Show this message and exit. --help Show this message and exit.
@ -61,6 +70,43 @@ This generates `report.html` and `report.json`.
![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true) ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true)
Batch comparison between folders of GT and OCR files can be done by simply providing
folders:
~~~
dinglehopper gt/ ocr/ report output_folder/
~~~
This assumes that you have files with the same name in both folders, e.g.
`gt/00000001.page.xml` and `ocr/00000001.page.xml`.
The example generates reports for each set of files, with the prefix `report`, in the
(automatically created) folder `output_folder/`.
By default, the JSON report does not contain the character and word differences, only
the calculated metrics. If you want to include the differences, set the
`--differences` option to `true`:
~~~
dinglehopper gt/ ocr/ report output_folder/ --differences true
~~~
### dinglehopper-summarize
A set of (JSON) reports can be summarized into a single set of
reports. This is useful after having generated reports in batch.
Example:
~~~
dinglehopper-summarize output_folder/
~~~
This generates `summary.html` and `summary.json` in the same `output_folder`.
If you are summarizing many reports and have used the `--differences` flag while
generating them, it may be useful to limit the number of differences reported by using
the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
report, making it easier to open and navigate. Note that the JSON report will still
contain all differences. Example:
~~~
dinglehopper-summarize output_folder/ --occurrences-threshold 10
~~~
### dinglehopper-line-dirs ### dinglehopper-line-dirs
You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.txt`) You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.txt`)
with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate

@ -1 +1 @@
qurator/dinglehopper/ocrd-tool.json src/dinglehopper/ocrd-tool.json

@ -0,0 +1,70 @@
[build-system]
requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"]

[project]
name = "dinglehopper"
authors = [
    {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
    {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
]
description = "The OCR evaluation tool"
readme = "README.md"
requires-python = ">=3.6"
keywords = ["qurator", "ocr", "evaluation", "ocr-d"]

# These fields are not given here but computed at build time, see
# [tool.setuptools.dynamic] below.
dynamic = ["version", "dependencies", "optional-dependencies"]

# https://pypi.org/classifiers/
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Environment :: Console",
    "Intended Audience :: Science/Research",
    "Intended Audience :: Other Audience",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
    "Topic :: Scientific/Engineering :: Information Analysis",
    "Topic :: Text Processing",
]

[project.scripts]
dinglehopper = "dinglehopper.cli:main"
dinglehopper-line-dirs = "dinglehopper.cli_line_dirs:main"
dinglehopper-extract = "dinglehopper.cli_extract:main"
dinglehopper-summarize = "dinglehopper.cli_summarize:main"
ocrd-dinglehopper = "dinglehopper.ocrd_cli:ocrd_dinglehopper"

[project.urls]
Homepage = "https://github.com/qurator-spk/dinglehopper"
Repository = "https://github.com/qurator-spk/dinglehopper.git"

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}
optional-dependencies.dev = {file = ["requirements-dev.txt"]}

[tool.setuptools.packages.find]
where = ["src"]

[tool.setuptools.package-data]
dinglehopper = ["templates/*"]

[tool.pytest.ini_options]
# A version string, not a TOML float: a float would lose information for
# versions such as 6.10.
minversion = "6.0"
addopts = "--strict-markers"
markers = [
    "integration: integration tests",
]

[tool.mypy]
ignore_missing_imports = true

[tool.ruff]
select = ["E", "F", "I"]
ignore = [
    "F811",  # multimethods are considered redefinitions by ruff
]

@ -1,4 +0,0 @@
[pytest]
markers =
integration: integration tests
serial

@ -1 +0,0 @@
__import__("pkg_resources").declare_namespace(__name__)

@ -1,5 +0,0 @@
from .ocr_files import *
from .extracted_text import *
from .character_error_rate import *
from .word_error_rate import *
from .align import *

@ -1,15 +0,0 @@
function find_diff_class(classes) {
return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
}
$(document).ready(function() {
/* Enable Bootstrap tooltips */
$('[data-toggle="tooltip"]').tooltip();
$('.diff').mouseover(function() {
find_diff_class($(this).attr('class')).addClass('diff-highlight');
});
$('.diff').mouseout(function() {
find_diff_class($(this).attr('class')).removeClass('diff-highlight');
});
});

@ -1,5 +1,8 @@
pytest pytest
pytest-flake8
pytest-cov pytest-cov
pytest-mypy pytest-mypy
black black
pre-commit
ruff ; python_version >= "3.7"
pytest-ruff ; python_version >= "3.7"

@ -10,4 +10,4 @@ attrs
multimethod >= 1.3 multimethod >= 1.3
tqdm tqdm
rapidfuzz >= 2.7.0 rapidfuzz >= 2.7.0
six # XXX workaround OCR-D/core#730 chardet

@ -1,12 +0,0 @@
[flake8]
max-line-length = 88
extend-ignore = E203, W503
[pylint]
max-line-length = 88
[pylint.messages_control]
disable = C0330, C0326
[mypy]
ignore_missing_imports = True

@ -1,34 +0,0 @@
from io import open
from setuptools import find_packages, setup
with open("requirements.txt") as fp:
install_requires = fp.read()
with open('requirements-dev.txt') as fp:
tests_require = fp.read()
setup(
name="dinglehopper",
author="Mike Gerber, The QURATOR SPK Team",
author_email="mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de",
description="The OCR evaluation tool",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
keywords="qurator ocr",
license="Apache",
namespace_packages=["qurator"],
packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
install_requires=install_requires,
tests_require=tests_require,
package_data={
"": ["*.json", "templates/*"],
},
entry_points={
"console_scripts": [
"dinglehopper=qurator.dinglehopper.cli:main",
"dinglehopper-line-dirs=qurator.dinglehopper.cli_line_dirs:main",
"dinglehopper-extract=qurator.dinglehopper.cli_extract:main",
"ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper",
]
},
)

@ -0,0 +1,33 @@
from .align import align, score_hint, seq_align
from .character_error_rate import character_error_rate, character_error_rate_n
from .edit_distance import distance, editops
from .extracted_text import ExtractedText
from .ocr_files import (
alto_namespace,
alto_text,
page_namespace,
page_text,
plain_text,
text,
)
from .word_error_rate import word_error_rate, word_error_rate_n, words
# Names exported by a wildcard import of this package; every entry is one of
# the names imported from the submodules above.
__all__ = [
"editops",
"distance",
"align",
"score_hint",
"seq_align",
"character_error_rate",
"character_error_rate_n",
"word_error_rate",
"word_error_rate_n",
"words",
"ExtractedText",
"alto_namespace",
"alto_text",
"page_namespace",
"page_text",
"plain_text",
"text",
]

@ -1,9 +1,12 @@
import math import math
import unicodedata
from math import ceil from math import ceil
from .edit_distance import *
from rapidfuzz.distance import Levenshtein from rapidfuzz.distance import Levenshtein
from .edit_distance import grapheme_clusters
def align(t1, t2): def align(t1, t2):
"""Align text.""" """Align text."""
s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1))) s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))

@ -1,20 +1,22 @@
import os import os
from collections import Counter
import click import click
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from markupsafe import escape from markupsafe import escape
from ocrd_utils import initLogging from ocrd_utils import initLogging
from math import ceil
from .character_error_rate import character_error_rate_n from dinglehopper.align import score_hint, seq_align
from .word_error_rate import word_error_rate_n, words_normalized from dinglehopper.character_error_rate import character_error_rate_n
from .align import seq_align, score_hint from dinglehopper.config import Config
from .extracted_text import ExtractedText from dinglehopper.extracted_text import ExtractedText
from .ocr_files import extract from dinglehopper.ocr_files import extract
from .config import Config from dinglehopper.word_error_rate import word_error_rate_n, words_normalized
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None): def gen_diff_report(
gt_in, ocr_in, css_prefix, joiner, none, *, differences=False, score_hint=None
):
gtx = "" gtx = ""
ocrx = "" ocrx = ""
@ -31,16 +33,12 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
# Set Bootstrap tooltip to the segment id # Set Bootstrap tooltip to the segment id
if id_: if id_:
html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_) html_custom_attrs += f'data-toggle="tooltip" title="{id_}"'
if css_classes: if css_classes:
return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format( return f'<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'
css_classes=css_classes,
html_t=html_t,
html_custom_attrs=html_custom_attrs,
)
else: else:
return "{html_t}".format(html_t=html_t) return f"{html_t}"
if isinstance(gt_in, ExtractedText): if isinstance(gt_in, ExtractedText):
if not isinstance(ocr_in, ExtractedText): if not isinstance(ocr_in, ExtractedText):
@ -53,6 +51,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
g_pos = 0 g_pos = 0
o_pos = 0 o_pos = 0
found_differences = []
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, score_hint)): for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, score_hint)):
css_classes = None css_classes = None
gt_id = None gt_id = None
@ -65,6 +65,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
# Deletions and inserts only produce one id + None, UI must # Deletions and inserts only produce one id + None, UI must
# support this, i.e. display for the one id produced # support this, i.e. display for the one id produced
if differences:
found_differences.append(f"{g} :: {o}")
gtx += joiner + format_thing(g, css_classes, gt_id) gtx += joiner + format_thing(g, css_classes, gt_id)
ocrx += joiner + format_thing(o, css_classes, ocr_id) ocrx += joiner + format_thing(o, css_classes, ocr_id)
@ -73,13 +76,18 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
if o is not None: if o is not None:
o_pos += len(o) o_pos += len(o)
return """ found_differences = dict(Counter(elem for elem in found_differences))
return (
"""
<div class="row"> <div class="row">
<div class="col-md-6 gt">{}</div> <div class="col-md-6 gt">{}</div>
<div class="col-md-6 ocr">{}</div> <div class="col-md-6 ocr">{}</div>
</div> </div>
""".format( """.format(
gtx, ocrx gtx, ocrx
),
found_differences,
) )
@ -96,11 +104,20 @@ def json_float(value):
return str(value) return str(value)
def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): def process(
gt,
ocr,
report_prefix,
reports_folder=".",
*,
metrics=True,
differences=False,
textequiv_level="region",
):
"""Check OCR result against GT. """Check OCR result against GT.
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use The @click decorators change the signature of the decorated functions, so we keep
Click on a wrapper. this undecorated version and use Click on a wrapper.
""" """
gt_text = extract(gt, textequiv_level=textequiv_level) gt_text = extract(gt, textequiv_level=textequiv_level)
@ -109,15 +126,25 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
ocr_words = words_normalized(ocr_text) ocr_words = words_normalized(ocr_text)
cer, n_characters = character_error_rate_n(gt_text, ocr_text) cer, n_characters = character_error_rate_n(gt_text, ocr_text)
char_diff_report = gen_diff_report( char_diff_report, diff_c = gen_diff_report(
gt_text, ocr_text, css_prefix="c", joiner="", none="·", gt_text,
score_hint=score_hint(cer, n_characters) ocr_text,
css_prefix="c",
joiner="",
none="·",
score_hint=score_hint(cer, n_characters),
differences=differences,
) )
wer, n_words = word_error_rate_n(gt_words, ocr_words) wer, n_words = word_error_rate_n(gt_words, ocr_words)
word_diff_report = gen_diff_report( word_diff_report, diff_w = gen_diff_report(
gt_words, ocr_words, css_prefix="w", joiner=" ", none="", gt_words,
score_hint=score_hint(wer, n_words) ocr_words,
css_prefix="w",
joiner=" ",
none="",
score_hint=score_hint(wer, n_words),
differences=differences,
) )
env = Environment( env = Environment(
@ -129,7 +156,11 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
for report_suffix in (".html", ".json"): for report_suffix in (".html", ".json"):
template_fn = "report" + report_suffix + ".j2" template_fn = "report" + report_suffix + ".j2"
out_fn = report_prefix + report_suffix
if not os.path.isdir(reports_folder):
os.mkdir(reports_folder)
out_fn = os.path.join(reports_folder, report_prefix + report_suffix)
template = env.get_template(template_fn) template = env.get_template(template_fn)
template.stream( template.stream(
@ -142,16 +173,46 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
char_diff_report=char_diff_report, char_diff_report=char_diff_report,
word_diff_report=word_diff_report, word_diff_report=word_diff_report,
metrics=metrics, metrics=metrics,
differences=differences,
diff_c=diff_c,
diff_w=diff_w,
).dump(out_fn) ).dump(out_fn)
def process_dir(
    gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level
):
    """Run process() for every file in directory gt that has a same-named file in ocr.

    Each report is named "<file name>-<report_prefix>" and written to
    reports_folder. Entries that are not regular files in both directories are
    skipped with a message on stdout.
    """
    for name in os.listdir(gt):
        gt_path = os.path.join(gt, name)
        ocr_path = os.path.join(ocr, name)

        # The OCR counterpart is looked up under the very same file name.
        if not (os.path.isfile(gt_path) and os.path.isfile(ocr_path)):
            print(f"Skipping {gt_path} and {ocr_path}")
            continue

        process(
            gt_path,
            ocr_path,
            f"{name}-{report_prefix}",
            reports_folder=reports_folder,
            metrics=metrics,
            differences=differences,
            textequiv_level=textequiv_level,
        )
@click.command() @click.command()
@click.argument("gt", type=click.Path(exists=True)) @click.argument("gt", type=click.Path(exists=True))
@click.argument("ocr", type=click.Path(exists=True)) @click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report") @click.argument("report_prefix", type=click.Path(), default="report")
@click.argument("reports_folder", type=click.Path(), default=".")
@click.option( @click.option(
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red" "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
) )
@click.option(
"--differences",
default=False,
help="Enable reporting character and word level differences",
)
@click.option( @click.option(
"--textequiv-level", "--textequiv-level",
default="region", default="region",
@ -159,7 +220,16 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
metavar="LEVEL", metavar="LEVEL",
) )
@click.option("--progress", default=False, is_flag=True, help="Show progress bar") @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): def main(
gt,
ocr,
report_prefix,
reports_folder,
metrics,
differences,
textequiv_level,
progress,
):
""" """
Compare the PAGE/ALTO/text document GT against the document OCR. Compare the PAGE/ALTO/text document GT against the document OCR.
@ -171,7 +241,8 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
that case, use --no-metrics to disable the then meaningless metrics and also that case, use --no-metrics to disable the then meaningless metrics and also
change the color scheme from green/red to blue. change the color scheme from green/red to blue.
The comparison report will be written to $REPORT_PREFIX.{html,json}, where The comparison report will be written to $REPORTS_FOLDER/$REPORT_PREFIX.{html,json},
where $REPORTS_FOLDER defaults to the current working directory and
$REPORT_PREFIX defaults to "report". The reports include the character error $REPORT_PREFIX defaults to "report". The reports include the character error
rate (CER) and the word error rate (WER). rate (CER) and the word error rate (WER).
@ -180,7 +251,31 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
""" """
initLogging() initLogging()
Config.progress = progress Config.progress = progress
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level) if os.path.isdir(gt):
if not os.path.isdir(ocr):
raise click.BadParameter(
"OCR must be a directory if GT is a directory", param_hint="ocr"
)
else:
process_dir(
gt,
ocr,
report_prefix,
reports_folder,
metrics,
differences,
textequiv_level,
)
else:
process(
gt,
ocr,
report_prefix,
reports_folder,
metrics=metrics,
differences=differences,
textequiv_level=textequiv_level,
)
if __name__ == "__main__": if __name__ == "__main__":

@ -1,15 +1,15 @@
import os
import itertools import itertools
import os
import click import click
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from ocrd_utils import initLogging from ocrd_utils import initLogging
from math import ceil
from .align import score_hint
from .character_error_rate import character_error_rate_n from .character_error_rate import character_error_rate_n
from .word_error_rate import word_error_rate_n, words_normalized
from .ocr_files import plain_extract
from .cli import gen_diff_report, json_float from .cli import gen_diff_report, json_float
from .ocr_files import plain_extract
from .word_error_rate import word_error_rate_n, words_normalized
def all_equal(iterable): def all_equal(iterable):
@ -75,12 +75,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
# Generate diff reports # Generate diff reports
char_diff_report += gen_diff_report( char_diff_report += gen_diff_report(
gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·", gt_text,
score_hint=score_hint(l_cer, l_n_characters) ocr_text,
css_prefix="l{0}-c".format(k),
joiner="",
none="·",
score_hint=score_hint(l_cer, l_n_characters),
) )
word_diff_report += gen_diff_report( word_diff_report += gen_diff_report(
gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="", gt_words,
score_hint=score_hint(l_wer, l_n_words) ocr_words,
css_prefix="l{0}-w".format(k),
joiner=" ",
none="",
score_hint=score_hint(l_wer, l_n_words),
) )
env = Environment( env = Environment(

@ -0,0 +1,106 @@
import json
import os
import click
from jinja2 import Environment, FileSystemLoader
from ocrd_utils import initLogging
from dinglehopper.cli import json_float
def process(reports_folder, occurrences_threshold=1):
    """Summarize all JSON reports found in reports_folder.

    Every "*.json" file in the folder that contains both a "cer" and a "wer"
    entry is taken into account. The average CER and WER are computed, and the
    character and word level differences (if present in the reports) are summed
    up per difference.

    The result is printed and rendered through the "summary.html.j2" and
    "summary.json.j2" templates into summary.html and summary.json inside
    reports_folder. occurrences_threshold is passed on to the templates.

    Returns None. If no usable report is found, a message is printed and no
    summary is written.
    """
    cer_list = []
    wer_list = []
    cer_sum = 0
    wer_sum = 0
    diff_c = {}
    diff_w = {}

    for report in os.listdir(reports_folder):
        if not report.endswith(".json"):
            continue

        report_path = os.path.join(reports_folder, report)
        # JSON is UTF-8; do not depend on the platform's default encoding.
        with open(report_path, "r", encoding="utf-8") as f:
            report_data = json.load(f)

        if "cer" not in report_data or "wer" not in report_data:
            click.echo(f"Skipping {report} because it does not contain CER and WER")
            continue

        cer = report_data["cer"]
        wer = report_data["wer"]
        cer_list.append(cer)
        wer_list.append(wer)
        cer_sum += cer
        wer_sum += wer

        # Reports may lack the differences, or either of the two levels.
        # Aggregate whatever is present.
        differences = report_data.get("differences", {})
        for key, value in differences.get("character_level", {}).items():
            diff_c[key] = diff_c.get(key, 0) + value
        for key, value in differences.get("word_level", {}).items():
            diff_w[key] = diff_w.get(key, 0) + value

    if len(cer_list) == 0:
        click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
        return

    cer_avg = cer_sum / len(cer_list)
    wer_avg = wer_sum / len(wer_list)

    click.echo(f"Number of reports: {len(cer_list)}")
    click.echo(f"Average CER: {cer_avg}")
    click.echo(f"Average WER: {wer_avg}")
    # These are the sums of the error rates over all reports, so label them as such.
    click.echo(f"Sum of CER: {cer_sum}")
    click.echo(f"Sum of WER: {wer_sum}")

    # The templates live in the "templates" directory next to this module.
    env = Environment(
        loader=FileSystemLoader(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
        )
    )
    env.filters["json_float"] = json_float

    for report_suffix in (".html", ".json"):
        template_fn = "summary" + report_suffix + ".j2"
        out_fn = os.path.join(reports_folder, "summary" + report_suffix)

        template = env.get_template(template_fn)
        template.stream(
            num_reports=len(cer_list),
            cer_avg=cer_avg,
            wer_avg=wer_avg,
            diff_c=diff_c,
            diff_w=diff_w,
            occurrences_threshold=occurrences_threshold,
        ).dump(out_fn)
@click.command()
@click.argument("reports_folder", type=click.Path(exists=True), default="./reports")
@click.option(
    "--occurrences-threshold",
    type=int,
    default=1,
    help="Only show differences that occur at least this many times.",
)
def main(reports_folder, occurrences_threshold):
    """
    Summarize the results from multiple reports generated earlier by dinglehopper.
    It calculates the average CER and WER, as well as a sum of common mistakes.
    Reports include lists of mistakes and their occurrences.
    You may use a threshold to reduce the file size of the HTML report by only showing
    mistakes whose number of occurrences is above the threshold. The JSON report will
    always contain all mistakes.
    All JSON files in the provided folder will be gathered and summarized.
    """
    # NOTE: the docstring above is shown by Click as the command's help text.
    initLogging()
    process(reports_folder, occurrences_threshold=occurrences_threshold)


if __name__ == "__main__":
    main()

@ -1,8 +1,8 @@
import unicodedata import unicodedata
from multimethod import multimethod from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters
from rapidfuzz.distance import Levenshtein from rapidfuzz.distance import Levenshtein
from uniseg.graphemecluster import grapheme_clusters
from .extracted_text import ExtractedText from .extracted_text import ExtractedText

@ -2,6 +2,7 @@ import os
import sys import sys
from typing import Iterator from typing import Iterator
import chardet
from lxml import etree as ET from lxml import etree as ET
from lxml.etree import XMLSyntaxError from lxml.etree import XMLSyntaxError
from uniseg.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
@ -12,8 +13,8 @@ from .extracted_text import ExtractedText, normalize_sbb
def alto_namespace(tree: ET.ElementTree) -> str: def alto_namespace(tree: ET.ElementTree) -> str:
"""Return the ALTO namespace used in the given ElementTree. """Return the ALTO namespace used in the given ElementTree.
This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not This relies on the assumption that, in any given ALTO file, the root element has the
check if the files uses any valid ALTO namespace. local name "alto". We do not check if the files uses any valid ALTO namespace.
""" """
root_name = ET.QName(tree.getroot().tag) root_name = ET.QName(tree.getroot().tag)
if root_name.localname == "alto": if root_name.localname == "alto":
@ -48,8 +49,9 @@ def alto_text(tree):
def page_namespace(tree): def page_namespace(tree):
"""Return the PAGE content namespace used in the given ElementTree. """Return the PAGE content namespace used in the given ElementTree.
This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We This relies on the assumption that, in any given PAGE content file, the root element
do not check if the files uses any valid PAGE namespace. has the local name "PcGts". We do not check if the files uses any valid PAGE
namespace.
""" """
root_name = ET.QName(tree.getroot().tag) root_name = ET.QName(tree.getroot().tag)
if root_name.localname == "PcGts": if root_name.localname == "PcGts":
@ -135,6 +137,10 @@ def page_text(tree, *, textequiv_level="region"):
return page_extract(tree, textequiv_level=textequiv_level).text return page_extract(tree, textequiv_level=textequiv_level).text
def detect_encoding(filename):
    """Guess the text encoding of a file from its first 1024 bytes.

    :param filename: path of the file to inspect; it is opened in binary mode.
    :return: the value chardet reports under the ``"encoding"`` key of its
        detection result. NOTE(review): chardet can report ``None`` here when
        it cannot make a guess (e.g. for an empty file) -- confirm that callers
        cope with that.
    """
    # Read the sample inside a context manager so the file handle is closed
    # right away instead of being left for the garbage collector to clean up.
    with open(filename, "rb") as f:
        sample = f.read(1024)
    return chardet.detect(sample)["encoding"]
def plain_extract(filename, include_filename_in_id=False): def plain_extract(filename, include_filename_in_id=False):
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}" id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
@ -149,7 +155,8 @@ def plain_extract(filename, include_filename_in_id=False):
clusters, clusters,
) )
with open(filename, "r") as f: fileencoding = detect_encoding(filename)
with open(filename, "r", encoding=fileencoding) as f:
return ExtractedText( return ExtractedText(
None, None,
[make_segment(no, line) for no, line in enumerate(f.readlines())], [make_segment(no, line) for no, line in enumerate(f.readlines())],
@ -171,7 +178,7 @@ def extract(filename, *, textequiv_level="region"):
""" """
try: try:
tree = ET.parse(filename) tree = ET.parse(filename)
except XMLSyntaxError: except (XMLSyntaxError, UnicodeDecodeError):
return plain_extract(filename) return plain_extract(filename)
try: try:
return page_extract(tree, textequiv_level=textequiv_level) return page_extract(tree, textequiv_level=textequiv_level)

@ -1,4 +1,5 @@
{ {
"version": "0.9.4",
"git_url": "https://github.com/qurator-spk/dinglehopper", "git_url": "https://github.com/qurator-spk/dinglehopper",
"tools": { "tools": {
"ocrd-dinglehopper": { "ocrd-dinglehopper": {

@ -4,7 +4,7 @@ import os
import click import click
from ocrd import Processor from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
from pkg_resources import resource_string from pkg_resources import resource_string
from .cli import process as cli_process from .cli import process as cli_process

@ -26,6 +26,22 @@
border: 2px solid; border: 2px solid;
border-radius: 5px; border-radius: 5px;
} }
.row {
margin-bottom: 20px;
}
table {
width: 100%;
}
th {
cursor: pointer;
}
th:hover {
background-color: #eee;
}
</style> </style>
</head> </head>
<body> <body>
@ -50,6 +66,32 @@
<h2>Word differences</h2> <h2>Word differences</h2>
{{ word_diff_report }} {{ word_diff_report }}
{%- if differences %}
{# One sortable table per level (character, word). The keys of diff_c/diff_w
   have the form "GT :: OCR"; the values are occurrence counts. #}
{% set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
<div class="row">
  {% for section in sections %}
  <div class="col-md-6">
    <h2>{{ section['title'] }}</h2>
    <table>
      <thead>
        <tr>
          <th>GT</th>
          <th>OCR</th>
          <th>Occurrences</th>
        </tr>
      </thead>
      {# The data rows must live in a <tbody>: the column sorter in
         report.html.js selects 'tbody > tr', so rows left inside an
         unclosed <thead> would never be sorted. #}
      <tbody>
        {% for gt_ocr, occurrences in section['data'].items() %}
        {# Split on the full " :: " separator so that neither side keeps a
           stray leading/trailing space. #}
        {% set parts = gt_ocr.split(" :: ") %}
        <tr>
          <td>{{ parts[0] }}</td>
          <td>{{ parts[1] }}</td>
          <td>{{ occurrences }}</td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
  </div>
  {% endfor %}
</div>
{%- endif %}
</div> </div>

@ -0,0 +1,39 @@
function find_diff_class(classes) {
return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
}
$(document).ready(function() {
  /* Enable Bootstrap tooltips */
  $('[data-toggle="tooltip"]').tooltip();

  /* Add or remove the highlight on every element that shares the hovered
     element's diff class */
  function setHighlight(element, enabled) {
    const matching = find_diff_class($(element).attr('class'));
    if (enabled) {
      matching.addClass('diff-highlight');
    } else {
      matching.removeClass('diff-highlight');
    }
  }

  $('.diff').mouseover(function() {
    setHighlight(this, true);
  });
  $('.diff').mouseout(function() {
    setHighlight(this, false);
  });

  /* Build a comparator for table rows based on the text of the cell at the
     given column index (case-insensitive, digit runs compared numerically) */
  function compareRows(index) {
    const cellText = function (row) {
      return $(row).children('td').eq(index).text().toLowerCase();
    };
    return function (row1, row2) {
      const cell1 = cellText(row1);
      const cell2 = cellText(row2);
      return cell1.localeCompare(cell2, undefined, {
        numeric: true,
        sensitivity: 'base'
      });
    };
  }

  /* Sort this column of the table; every click on the same header flips
     between ascending and descending order */
  $('th').click(function () {
    const header = this;
    const table = $(header).closest('table');
    const sorted = table.find('tbody > tr').toArray().sort(compareRows($(header).index()));
    header.asc = !header.asc;
    const ordered = header.asc ? sorted : sorted.reverse();
    ordered.forEach(function (row) {
      table.children('tbody').append(row);
    });
  });
});

@ -4,6 +4,12 @@
{% if metrics %} {% if metrics %}
"cer": {{ cer|json_float }}, "cer": {{ cer|json_float }},
"wer": {{ wer|json_float }}, "wer": {{ wer|json_float }},
{% endif %}
{% if differences %}
"differences": {
"character_level": {{ diff_c|tojson }},
"word_level": {{ diff_w|tojson }}
},
{% endif %} {% endif %}
"n_characters": {{ n_characters }}, "n_characters": {{ n_characters }},
"n_words": {{ n_words }} "n_words": {{ n_words }}

@ -0,0 +1,136 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
<style type="text/css">
{% if metrics %}
.gt .diff {
color: green;
}
.ocr .diff {
color: red;
}
{% else %}
.gt .diff, .ocr .diff {
color: blue;
}
{% endif %}
.ellipsis {
opacity: 0.5;
font-style: italic;
}
.diff-highlight {
border: 2px solid;
border-radius: 5px;
}
.row {
margin-bottom: 20px;
}
table {
width: 100%;
}
.cer {
flex-direction: column;
}
tr:hover {
background-color: #f5f5f5;
}
th {
cursor: pointer;
}
th:hover {
background-color: #eee;
}
td {
min-width: 100px;
}
td:hover {
background-color: #eee;
}
</style>
</head>
<body>
<div class="container">
<div class="row">
<h1>Summary of all reports</h1>
</div>
<div class="row">
<p>Number of reports: {{ num_reports }}</p>
</div>
{% if cer_avg and wer_avg -%}
<div class="row">
<h2>Metrics</h2>
</div>
<div class="row cer">
<p>Average CER: {{ cer_avg|round(4) }}</p>
<p>Average WER: {{ wer_avg|round(4) }}</p>
</div>
{% endif %}
{%- if diff_c and diff_w %}
{#- One table per level (character, word). Keys of diff_c/diff_w have the form
    "GT :: OCR"; values are the summed occurrence counts. #}
{%- set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
<div class="row">
  {%- for section in sections %}
  <div class="col-md-6">
    <h2>{{ section['title'] }}</h2>
    {#- Counts the rows hidden by the occurrences threshold. A namespace is
        needed so the inner loop can update it; it is created afresh for
        every section, so no manual reset is required. #}
    {%- set num_omitted = namespace(value=0) %}
    <table>
      <thead>
        <tr><th>GT</th><th>OCR</th><th>Occurrences</th></tr>
      </thead>
      <tbody>
        {%- for gt_ocr, occurrences in section['data'].items() %}
        {%- if occurrences < occurrences_threshold %}
        {%- set num_omitted.value = num_omitted.value + 1 %}
        {%- else %}
        {%- set gt = gt_ocr.split(" :: ")[0] %}
        {%- set ocr = gt_ocr.split(" :: ")[1] %}
        <tr>
          {#- the cell shows the character(s) themselves, the tooltip their URL-encoded form #}
          <td title="{{ gt|urlencode }}">{{ gt }}</td>
          <td title="{{ ocr|urlencode }}">{{ ocr }}</td>
          <td>{{ occurrences }}</td>
        </tr>
        {%- endif %}
        {%- endfor %}
      </tbody>
    </table>
    {#- The note is placed after the table: a <p> is not allowed as a direct
        child of <table>, and browsers would relocate it unpredictably. #}
    {%- if num_omitted.value > 0 and occurrences_threshold > 1 %}
    <p>Skipped {{ num_omitted.value }} diffs with fewer than {{ occurrences_threshold }} occurrences. The complete list of diffs is available in the accompanying JSON file.</p>
    {%- endif %}
  </div>
  {%- endfor %}
</div>
{%- endif %}
</div>
<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
<script>
{% include 'report.html.js' %}
</script>
</body>
</html>

@ -0,0 +1,15 @@
{#- JSON summary of all reports. "num_reports" is always written; the metric
    and difference entries are only written when there is data for them. -#}
{
  "num_reports": {{ num_reports }}
  {%- if cer_avg and wer_avg %}
  ,
  "cer_avg": {{ cer_avg|json_float }},
  "wer_avg": {{ wer_avg|json_float }}
  {%- endif %}
  {#- Emit the differences whenever any were collected on either level. This
      used to be gated on wer_avg, an unrelated metric, which dropped the
      differences from the JSON whenever the average WER was zero. -#}
  {%- if diff_c or diff_w %}
  ,
  "differences": {
    "character_level": {{ diff_c|tojson }},
    "word_level": {{ diff_w|tojson }}
  }
  {%- endif %}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -6,7 +6,7 @@ import pytest
from lxml import etree as ET from lxml import etree as ET
from uniseg.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
from .. import seq_align, ExtractedText from .. import ExtractedText, seq_align
def test_text(): def test_text():
@ -30,12 +30,20 @@ def test_text():
def test_normalization_check(): def test_normalization_check():
with pytest.raises(ValueError, match=r".*is not in NFC.*"): with pytest.raises(ValueError, match=r".*is not in NFC.*"):
ExtractedText("foo", None, None, ExtractedText(
"foo",
None,
None,
unicodedata.normalize("NFD", "Schlyñ"), unicodedata.normalize("NFD", "Schlyñ"),
grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ"))) grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")),
assert ExtractedText("foo", None, None, )
assert ExtractedText(
"foo",
None,
None,
unicodedata.normalize("NFC", "Schlyñ"), unicodedata.normalize("NFC", "Schlyñ"),
grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ"))) grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")),
)
AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id") AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")

@ -1,7 +1,9 @@
import math import math
import pytest import pytest
from .. import align, distance, score_hint, seq_align
from .util import unzip from .util import unzip
from .. import align, seq_align, distance, score_hint
def test_left_empty(): def test_left_empty():
@ -72,7 +74,8 @@ def test_with_some_fake_ocr_errors():
result = list( result = list(
align( align(
"Über die vielen Sorgen wegen desselben vergaß", "Über die vielen Sorgen wegen desselben vergaß",
"SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab", "SomeJunk MoreJunk "
+ "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
) )
) )
left, right = unzip(result) left, right = unzip(result)
@ -183,6 +186,7 @@ def test_lines_similar():
# Test __eq__ (i.e. is it a substitution or a similar string?) # Test __eq__ (i.e. is it a substitution or a similar string?)
assert list(left)[0] == list(right)[0] assert list(left)[0] == list(right)[0]
def test_score_hint(): def test_score_hint():
assert score_hint(0.5, 23) == 12 # int(ceil()) assert score_hint(0.5, 23) == 12 # int(ceil())
assert score_hint(math.inf, 12345) is None assert score_hint(math.inf, 12345) is None

@ -36,6 +36,7 @@ def test_character_error_rate_hard():
len(s2) == 7 len(s2) == 7
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points ) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
# Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical. # Both strings have the same length in terms of grapheme clusters. So the CER should
# be symmetrical.
assert character_error_rate(s2, s1) == 1 / 6 assert character_error_rate(s2, s1) == 1 / 6
assert character_error_rate(s1, s2) == 1 / 6 assert character_error_rate(s1, s2) == 1 / 6

@ -15,7 +15,9 @@ def test_align_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
# → 2 elements in the alignment should be different, the ligature is # → 2 elements in the alignment should be different, the ligature is
# (currently) not counted due to normalization. # (currently) not counted due to normalization.
# NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters. #
# NOTE: In this example, it doesn't matter that we work with "characters", not
# grapheme clusters.
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml"))) ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))

@ -0,0 +1,28 @@
from __future__ import division, print_function
import os
import pytest
from lxml import etree as ET
from .. import alto_text, character_error_rate, page_text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
def test_bigger_texts():
    """Compute the CER for a larger PAGE (GT) / ALTO (OCR) document pair."""
    sample_dir = os.path.join(data_dir, "bigger-texts", "00008228")

    gt_path = os.path.join(sample_dir, "00008228.gt.xml")
    gt = page_text(ET.parse(gt_path))

    ocr_path = os.path.join(sample_dir, "00008228-00236534.gt4hist.xml")
    ocr = alto_text(ET.parse(ocr_path))

    # Only interested in a result here: In earlier versions this would have used
    # tens of GB of RAM and should now not break a sweat.
    cer = character_error_rate(gt, ocr)
    assert cer >= 0.0

@ -6,7 +6,7 @@ import pytest
from lxml import etree as ET from lxml import etree as ET
from uniseg.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
from .. import character_error_rate, page_text, alto_text from .. import alto_text, character_error_rate, page_text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")

@ -0,0 +1,53 @@
import os
import pytest
from ocrd_utils import initLogging
from dinglehopper.cli import process_dir
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
def test_cli_directory(tmp_path):
    """
    Run cli/process_dir() on a directory pair and check that a JSON and an
    HTML report is written for each of the two input files.
    """
    initLogging()

    input_root = os.path.join(data_dir, "directory-test")
    reports_dir = tmp_path / "reports"
    process_dir(
        os.path.join(input_root, "gt"),
        os.path.join(input_root, "ocr"),
        "report",
        str(reports_dir),
        False,
        True,
        "line",
    )

    for stem in ("1.xml-report", "2.xml-report"):
        for suffix in (".json", ".html"):
            assert os.path.exists(reports_dir / (stem + suffix))
@pytest.mark.integration
def test_cli_fail_without_gt(tmp_path):
    """
    cli/process_dir() should skip any file that has no counterpart in the other
    directory, so exactly two reports (JSON + HTML each) are expected.
    """
    initLogging()

    input_root = os.path.join(data_dir, "directory-test")
    reports_dir = tmp_path / "reports"
    process_dir(
        os.path.join(input_root, "gt"),
        os.path.join(input_root, "ocr"),
        "report",
        str(reports_dir),
        False,
        True,
        "line",
    )

    written_files = os.listdir(reports_dir)
    assert len(written_files) == 2 * 2

@ -1,9 +1,9 @@
import json import json
import pytest import pytest
from .util import working_directory
from ..cli import process from ..cli import process
from .util import working_directory
@pytest.mark.integration @pytest.mark.integration

@ -0,0 +1,37 @@
import json
import os
import pytest
from ocrd_utils import initLogging
from dinglehopper.cli import process
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
def test_cli_differences(tmp_path):
    """Check that cli/process() with differences=True writes a JSON report
    listing the character and word differences between GT and OCR text."""
    initLogging()

    gt_file = os.path.join(data_dir, "test-gt.page2018.xml")
    ocr_file = os.path.join(data_dir, "test-fake-ocr.page2018.xml")
    process(gt_file, ocr_file, "report", tmp_path, differences=True)

    report_file = tmp_path / "report.json"
    assert os.path.exists(report_file)

    with open(report_file, "r") as jsonf:
        report = json.load(jsonf)

    expected_differences = {
        "character_level": {"n :: m": 1, "ſ :: f": 1},
        "word_level": {
            "Augenblick :: Augemblick": 1,
            "Verſprochene :: Verfprochene": 1,
        },
    }
    assert report["differences"] == expected_differences

@ -5,7 +5,7 @@ import os
import pytest import pytest
from lxml import etree as ET from lxml import etree as ET
from .. import distance, page_text, alto_text from .. import alto_text, distance, page_text
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")

@ -1,21 +1,20 @@
import json
import os import os
import shutil import shutil
import json
import sys import sys
from pathlib import Path from pathlib import Path
import pytest import pytest
from click.testing import CliRunner from click.testing import CliRunner
from .util import working_directory
from ..ocrd_cli import ocrd_dinglehopper from ..ocrd_cli import ocrd_dinglehopper
from .util import working_directory
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration @pytest.mark.integration
@pytest.mark.skipif(sys.platform == 'win32', reason="only on unix") @pytest.mark.skipif(sys.platform == "win32", reason="only on unix")
def test_ocrd_cli(tmp_path): def test_ocrd_cli(tmp_path):
"""Test OCR-D interface""" """Test OCR-D interface"""

@ -0,0 +1,110 @@
import json
import os
import pytest
from .. import cli_summarize
from .util import working_directory
expected_cer_avg = (0.05 + 0.10) / 2
expected_wer_avg = (0.15 + 0.20) / 2
expected_diff_c = {"a": 30, "b": 50}
expected_diff_w = {"c": 70, "d": 90}
@pytest.fixture
def create_summaries(tmp_path):
    """Write two mock JSON reports into a fresh "reports" directory below
    tmp_path and return that directory's path as a string."""
    reports_dirname = tmp_path / "reports"
    reports_dirname.mkdir()

    # File name -> report content
    mock_reports = {
        "report1.json": {
            "cer": 0.05,
            "wer": 0.15,
            "differences": {
                "character_level": {"a": 10, "b": 20},
                "word_level": {"c": 30, "d": 40},
            },
        },
        "report2.json": {
            "cer": 0.10,
            "wer": 0.20,
            "differences": {
                "character_level": {"a": 20, "b": 30},
                "word_level": {"c": 40, "d": 50},
            },
        },
    }
    for report_name, report in mock_reports.items():
        with open(os.path.join(reports_dirname, report_name), "w") as f:
            json.dump(report, f)

    return str(reports_dirname)
@pytest.mark.integration
def test_cli_summarize_json(tmp_path, create_summaries):
    """The summary.json written by cli_summarize.process() must contain the
    report count, the averaged metrics and the summed differences."""
    with working_directory(tmp_path):
        reports_dirname = create_summaries
        cli_summarize.process(reports_dirname)

        summary_file = os.path.join(reports_dirname, "summary.json")
        with open(summary_file, "r") as f:
            summary_data = json.load(f)

        assert summary_data["num_reports"] == 2
        assert summary_data["cer_avg"] == expected_cer_avg
        assert summary_data["wer_avg"] == expected_wer_avg

        differences = summary_data["differences"]
        assert differences["character_level"] == expected_diff_c
        assert differences["word_level"] == expected_diff_w
@pytest.mark.integration
def test_cli_summarize_html(tmp_path, create_summaries):
    """The summary.html written by cli_summarize.process() must exist and show
    the report count and the rounded average CER/WER."""
    with working_directory(tmp_path):
        reports_dirname = create_summaries
        cli_summarize.process(reports_dirname)

        html_file = os.path.join(reports_dirname, "summary.html")
        assert os.path.isfile(html_file)

        with open(html_file, "r") as f:
            contents = f.read()

        assert len(contents) > 0
        expected_snippets = (
            "Number of reports: 2",
            f"Average CER: {round(expected_cer_avg, 4)}",
            f"Average WER: {round(expected_wer_avg, 4)}",
        )
        for snippet in expected_snippets:
            assert snippet in contents
@pytest.mark.integration
def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries):
    """
    A report without a WER value must be left out of the summary produced by
    cli_summarize.process().
    """
    with working_directory(tmp_path):
        reports_dirname = create_summaries

        # This third report has no WER value and should not be included in the summary
        incomplete_report = {
            "cer": 0.10,
            "differences": {
                "character_level": {"a": 20, "b": 30},
                "word_level": {"c": 40, "d": 50},
            },
        }
        incomplete_path = os.path.join(reports_dirname, "report3-missing-wer.json")
        with open(incomplete_path, "w") as f:
            json.dump(incomplete_report, f)

        cli_summarize.process(reports_dirname)

        html_file = os.path.join(reports_dirname, "summary.html")
        assert os.path.isfile(html_file)

        with open(html_file, "r") as f:
            contents = f.read()

        # Still two reports: the one without WER is not counted
        assert "Number of reports: 2" in contents

@ -5,15 +5,15 @@ import os
import pytest import pytest
from lxml import etree as ET from lxml import etree as ET
from .. import word_error_rate, words, page_text, alto_text from .. import alto_text, page_text, word_error_rate, words
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration @pytest.mark.integration
def test_word_error_rate_between_page_files(): def test_word_error_rate_between_page_files():
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words, # In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
# the ligature does not count → 2 errors # So we have 3 changed words, the ligature does not count → 2 errors
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml"))) gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
gt_word_count = ( gt_word_count = (

@ -1,13 +1,11 @@
import os import os
import re import re
import lxml.etree as ET
import textwrap import textwrap
import pytest import lxml.etree as ET
from .util import working_directory
from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
from .util import working_directory
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@ -161,7 +159,8 @@ def test_page_level():
result = page_text(tree, textequiv_level="line") result = page_text(tree, textequiv_level="line")
assert ( assert (
result result
== "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-" == "Hand, Mylord? fragte der Graf von Rocheſter.\n"
+ "Als er einsmals in dem Oberhauſe eine Bill we-"
) )

@ -27,7 +27,8 @@ def test_words():
def test_words_private_use_area(): def test_words_private_use_area():
result = list( result = list(
words( words(
"ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n" "ber die vielen Sorgen wegen deelben vergaß Hartkopf, "
"der Frau Amtmnnin das ver⸗\n"
"ſproene zu berliefern." "ſproene zu berliefern."
) )
) )

@ -1,8 +1,8 @@
import os
from itertools import zip_longest from itertools import zip_longest
from typing import Iterable from typing import Iterable
import colorama import colorama
import os
def diffprint(x, y): def diffprint(x, y):

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save