mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-01 06:29:59 +02:00
Merge branch 'master' into performance
This commit is contained in:
commit
38fcbc8e1c
101 changed files with 58154 additions and 199 deletions
|
@ -1,23 +0,0 @@
|
|||
version: 2.1
|
||||
|
||||
jobs:
|
||||
test:
|
||||
parameters:
|
||||
python-version:
|
||||
type: string
|
||||
docker:
|
||||
- image: cimg/python:<< parameters.python-version >>
|
||||
steps:
|
||||
- checkout
|
||||
- run: pip3 install --upgrade pip
|
||||
- run: pip3 install -r requirements.txt
|
||||
- run: pip3 install pytest
|
||||
- run: pytest
|
||||
|
||||
workflows:
|
||||
all-tests:
|
||||
jobs:
|
||||
- test:
|
||||
matrix:
|
||||
parameters:
|
||||
python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
|
|
@ -15,7 +15,7 @@ indent_size = 2
|
|||
|
||||
[*.json]
|
||||
indent_size = 2
|
||||
insert_final_newline = false
|
||||
insert_final_newline = true
|
||||
|
||||
# trailing spaces in markdown indicate word wrap
|
||||
[*.md]
|
||||
|
|
14
.github/workflows/release-check-version-tag
vendored
Executable file
14
.github/workflows/release-check-version-tag
vendored
Executable file
|
@ -0,0 +1,14 @@
|
|||
#!/bin/bash
# Check that the Python package version matches the current git tag.
#
# Exits 0 when "v<package version>" equals the output of `git describe --tags`,
# and 1 otherwise.

# Abort on the first failing command (including a failing command substitution
# in the assignments below), so that a broken setuptools or git call is reported
# as such instead of surfacing as a misleading "does NOT match" message.
set -euo pipefail

# We call setuptools.setup() here as we may rely on setuptools to interpret
# a dynamic version field. (Reading pyproject.toml is not enough in that case.)
expected_git_tag="v$(python -c 'from setuptools import setup; setup()' --version)"
actual_git_tag="$(git describe --tags)"

if [[ "$expected_git_tag" == "$actual_git_tag" ]]; then
  echo "OK: Python package version $expected_git_tag matches git tag"
  exit 0
else
  echo "ERROR: Python package version $expected_git_tag does NOT match git tag $actual_git_tag" >&2
  exit 1
fi
|
69
.github/workflows/release.yml
vendored
Normal file
69
.github/workflows/release.yml
vendored
Normal file
|
@ -0,0 +1,69 @@
|
|||
name: release
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "v*.*.*"
|
||||
|
||||
env:
|
||||
PYPI_URL: https://pypi.org/p/dinglehopper
|
||||
|
||||
jobs:
|
||||
test:
|
||||
uses: ./.github/workflows/test.yml
|
||||
|
||||
build:
|
||||
needs: test
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
- name: Upgrade pip
|
||||
run: python3 -m pip install --upgrade pip
|
||||
- name: Install setuptools
|
||||
run: |
|
||||
python3 -m pip install --upgrade setuptools
|
||||
# For OCR-D tools, we need setuptools-ocrd to get the version
|
||||
if [ -e ocrd-tool.json ]; then
|
||||
python3 -m pip install setuptools-ocrd
|
||||
fi
|
||||
- name: Check git tag vs package version
|
||||
run: .github/workflows/release-check-version-tag
|
||||
- name: Build package
|
||||
run: python3 -m pip install --upgrade build && python3 -m build
|
||||
- name: Upload dist
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: dist
|
||||
path: dist/
|
||||
|
||||
github-release:
|
||||
needs: build
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Download dist
|
||||
uses: actions/download-artifact@v3
|
||||
with:
|
||||
name: dist
|
||||
path: dist/
|
||||
- name: Create release on GitHub
|
||||
uses: softprops/action-gh-release@v1
|
||||
with:
|
||||
files: dist/*
|
||||
|
||||
pypi-publish:
|
||||
needs: build
|
||||
runs-on: ubuntu-latest
|
||||
environment:
|
||||
name: pypi
|
||||
url: ${{ env.PYPI_URL }}
|
||||
permissions:
|
||||
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
|
||||
steps:
|
||||
- name: Download dist
|
||||
uses: actions/download-artifact@v3
|
||||
with:
|
||||
name: dist
|
||||
path: dist/
|
||||
- name: Publish package distributions to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
76
.github/workflows/test.yml
vendored
Normal file
76
.github/workflows/test.yml
vendored
Normal file
|
@ -0,0 +1,76 @@
|
|||
name: test
|
||||
|
||||
on:
|
||||
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
pull_request:
|
||||
branches:
|
||||
- master
|
||||
|
||||
schedule:
|
||||
- cron: "00 16 07 * *" # = monthly
|
||||
|
||||
# Allow manually running (from GitHub Web)
|
||||
workflow_dispatch:
|
||||
|
||||
# Allow calling this workflow (e.g. from release workflow)
|
||||
workflow_call:
|
||||
|
||||
jobs:
|
||||
test:
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12" ]
|
||||
|
||||
# For Python 3.6, we need to fall back to Ubuntu 20.04
|
||||
runs-on: ${{ matrix.python-version == '3.6' && 'ubuntu-20.04' || 'ubuntu-latest' }}
|
||||
|
||||
env:
|
||||
test_results_dir: test-results-${{ matrix.python-version }}
|
||||
|
||||
steps:
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Update pip
|
||||
run: python3 -m pip install -U pip
|
||||
- name: Avoid compiling OpenCV and NumPy on Python 3.6
|
||||
run: |
|
||||
if python3 --version | grep -q "Python 3.6"; then
|
||||
pip install --prefer-binary -U opencv-python-headless numpy
|
||||
fi
|
||||
- name: Install requirements*.txt
|
||||
run: |
|
||||
for requirements_txt in requirements*.txt; do
|
||||
python3 -m pip install -r $requirements_txt;
|
||||
done
|
||||
|
||||
- name: Test
|
||||
run: |
|
||||
cd src
|
||||
mkdir -p ../$test_results_dir
|
||||
python3 -m pytest --junitxml=../$test_results_dir/junit.xml -o junit_family=legacy
|
||||
- name: Upload test results
|
||||
uses: actions/upload-artifact@v3
|
||||
if: success() || failure()
|
||||
with:
|
||||
name: ${{ env.test_results_dir }}
|
||||
path: ${{ env.test_results_dir }}
|
||||
|
||||
- name: Report tests
|
||||
uses: dorny/test-reporter@v1
|
||||
if: success() || failure()
|
||||
with:
|
||||
name: Results on Python ${{ matrix.python-version }}
|
||||
path: "${{env.test_results_dir }}/junit.xml"
|
||||
reporter: java-junit
|
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -16,6 +16,7 @@ htmlcov/
|
|||
.venv
|
||||
env/
|
||||
venv/
|
||||
.python-version
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
|
@ -27,3 +28,4 @@ dmypy.json
|
|||
|
||||
# Build artifacts
|
||||
/build
|
||||
/dist
|
||||
|
|
36
.pre-commit-config.yaml
Normal file
36
.pre-commit-config.yaml
Normal file
|
@ -0,0 +1,36 @@
|
|||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v4.5.0
|
||||
hooks:
|
||||
- id: trailing-whitespace
|
||||
- id: end-of-file-fixer
|
||||
- id: check-json
|
||||
- id: check-toml
|
||||
- id: check-yaml
|
||||
- id: check-added-large-files
|
||||
- id: check-ast
|
||||
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 23.10.0
|
||||
hooks:
|
||||
- id: black
|
||||
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.1.1
|
||||
hooks:
|
||||
- args:
|
||||
- --fix
|
||||
- --exit-non-zero-on-fix
|
||||
id: ruff
|
||||
|
||||
- repo: https://github.com/pre-commit/mirrors-mypy
|
||||
rev: v1.6.1
|
||||
hooks:
|
||||
- additional_dependencies:
|
||||
- types-setuptools
|
||||
id: mypy
|
||||
|
||||
- repo: https://gitlab.com/vojko.pribudic/pre-commit-update
|
||||
rev: v0.1.0
|
||||
hooks:
|
||||
- id: pre-commit-update
|
|
@ -1,6 +1,6 @@
|
|||
Testing
|
||||
=======
|
||||
Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests):
|
||||
Use `pytest` to run the tests in [the tests directory](dinglehopper/tests):
|
||||
```bash
|
||||
virtualenv -p /usr/bin/python3 venv
|
||||
. venv/bin/activate
|
||||
|
@ -10,6 +10,7 @@ pytest
|
|||
```
|
||||
|
||||
## Test running examples
|
||||
|
||||
Only unit tests:
|
||||
```bash
|
||||
pytest -m "not integration"
|
||||
|
@ -27,11 +28,18 @@ pytest
|
|||
|
||||
All tests with code coverage:
|
||||
```bash
|
||||
pytest --cov=qurator --cov-report=html
|
||||
pytest --cov=dinglehopper --cov-report=html
|
||||
```
|
||||
|
||||
Static code analysis:
|
||||
```bash
|
||||
pytest -k "not test" --flake8
|
||||
pytest -k "not test" --mypy
|
||||
pytest -k "not test" --ruff
|
||||
```
|
||||
|
||||
# How to use pre-commit
|
||||
|
||||
This project optionally uses [pre-commit](https://pre-commit.com) to check commits. To use it:
|
||||
|
||||
- Install pre-commit, e.g. `pip install -r requirements-dev.txt`
|
||||
- Install the repo-local git hooks: `pre-commit install`
|
||||
|
|
72
README.md
72
README.md
|
@ -5,9 +5,13 @@ dinglehopper is an OCR evaluation tool and reads
|
|||
[ALTO](https://github.com/altoxml),
|
||||
[PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. It
|
||||
compares a ground truth (GT) document page with an OCR result page to compute
|
||||
metrics and a word/character differences report.
|
||||
metrics and a word/character differences report. It also supports batch processing by
|
||||
generating, aggregating and summarizing multiple reports.
|
||||
|
||||
[](https://circleci.com/gh/qurator-spk/dinglehopper)
|
||||
[](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
|
||||
[](https://github.com/qurator-spk/dinglehopper/releases/)
|
||||
[](#license)
|
||||
[](https://github.com/qurator-spk/dinglehopper/issues)
|
||||
|
||||
Goals
|
||||
-----
|
||||
|
@ -19,15 +23,16 @@ Goals
|
|||
|
||||
Installation
|
||||
------------
|
||||
It's best to use pip, e.g.:
|
||||
~~~
|
||||
sudo pip install .
|
||||
~~~
|
||||
|
||||
It's best to use pip to install the package from PyPI, e.g.:
|
||||
```
|
||||
pip install dinglehopper
|
||||
```
|
||||
|
||||
Usage
|
||||
-----
|
||||
~~~
|
||||
Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
|
||||
Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] [REPORTS_FOLDER]
|
||||
|
||||
Compare the PAGE/ALTO/text document GT against the document OCR.
|
||||
|
||||
|
@ -35,19 +40,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
|
|||
their text and falls back to plain text if no ALTO or PAGE is detected.
|
||||
|
||||
The files GT and OCR are usually a ground truth document and the result of
|
||||
an OCR software, but you may use dinglehopper to compare two OCR results.
|
||||
In that case, use --no-metrics to disable the then meaningless metrics and
|
||||
also change the color scheme from green/red to blue.
|
||||
an OCR software, but you may use dinglehopper to compare two OCR results. In
|
||||
that case, use --no-metrics to disable the then meaningless metrics and also
|
||||
change the color scheme from green/red to blue.
|
||||
|
||||
The comparison report will be written to $REPORT_PREFIX.{html,json}, where
|
||||
$REPORT_PREFIX defaults to "report". The reports include the character
|
||||
error rate (CER) and the word error rate (WER).
|
||||
The comparison report will be written to
|
||||
$REPORTS_FOLDER/$REPORT_PREFIX.{html,json}, where $REPORTS_FOLDER defaults
|
||||
to the current working directory and $REPORT_PREFIX defaults to "report".
|
||||
The reports include the character error rate (CER) and the word error rate
|
||||
(WER).
|
||||
|
||||
By default, the text of PAGE files is extracted on 'region' level. You may
|
||||
use "--textequiv-level line" to extract from the level of TextLine tags.
|
||||
|
||||
Options:
|
||||
--metrics / --no-metrics Enable/disable metrics and green/red
|
||||
--differences BOOLEAN Enable reporting character and word level
|
||||
differences
|
||||
--textequiv-level LEVEL PAGE TextEquiv level to extract text from
|
||||
--progress Show progress bar
|
||||
--help Show this message and exit.
|
||||
|
@ -61,6 +70,43 @@ This generates `report.html` and `report.json`.
|
|||
|
||||

|
||||
|
||||
Batch comparison between folders of GT and OCR files can be done by simply providing
|
||||
folders:
|
||||
~~~
|
||||
dinglehopper gt/ ocr/ report output_folder/
|
||||
~~~
|
||||
This assumes that you have files with the same name in both folders, e.g.
|
||||
`gt/00000001.page.xml` and `ocr/00000001.page.xml`.
|
||||
|
||||
The example generates reports for each pair of files, named `<file name>-report.{html,json}`, in the
|
||||
(automatically created) folder `output_folder/`.
|
||||
|
||||
By default, the JSON report does not contain the character and word differences, only
|
||||
the calculated metrics. If you want to include the differences, use the
|
||||
`--differences` flag:
|
||||
|
||||
~~~
|
||||
dinglehopper gt/ ocr/ report output_folder/ --differences true
|
||||
~~~
|
||||
|
||||
### dinglehopper-summarize
|
||||
A set of (JSON) reports can be summarized into a single set of
|
||||
reports. This is useful after having generated reports in batch.
|
||||
Example:
|
||||
~~~
|
||||
dinglehopper-summarize output_folder/
|
||||
~~~
|
||||
This generates `summary.html` and `summary.json` in the same `output_folder`.
|
||||
|
||||
If you are summarizing many reports and have used the `--differences` flag while
|
||||
generating them, it may be useful to limit the number of differences reported by using
|
||||
the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
|
||||
report, making it easier to open and navigate. Note that the JSON report will still
|
||||
contain all differences. Example:
|
||||
~~~
|
||||
dinglehopper-summarize output_folder/ --occurrences-threshold 10
|
||||
~~~
|
||||
|
||||
### dinglehopper-line-dirs
|
||||
You also may want to compare a directory of GT text files (e.g. `gt/line0001.gt.txt`)
|
||||
with a directory of OCR text files (e.g. `ocr/line0001.some-ocr.txt`) with a separate
|
||||
|
|
|
@ -1 +1 @@
|
|||
qurator/dinglehopper/ocrd-tool.json
|
||||
src/dinglehopper/ocrd-tool.json
|
70
pyproject.toml
Normal file
70
pyproject.toml
Normal file
|
@ -0,0 +1,70 @@
|
|||
[build-system]
|
||||
requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"]
|
||||
|
||||
[project]
|
||||
name = "dinglehopper"
|
||||
authors = [
|
||||
{name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
|
||||
{name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
|
||||
]
|
||||
description = "The OCR evaluation tool"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.6"
|
||||
keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
|
||||
|
||||
dynamic = ["version", "dependencies", "optional-dependencies"]
|
||||
|
||||
# https://pypi.org/classifiers/
|
||||
classifiers = [
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Environment :: Console",
|
||||
"Intended Audience :: Science/Research",
|
||||
"Intended Audience :: Other Audience",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3 :: Only",
|
||||
"Topic :: Scientific/Engineering :: Information Analysis",
|
||||
"Topic :: Text Processing",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
dinglehopper = "dinglehopper.cli:main"
|
||||
dinglehopper-line-dirs = "dinglehopper.cli_line_dirs:main"
|
||||
dinglehopper-extract = "dinglehopper.cli_extract:main"
|
||||
dinglehopper-summarize = "dinglehopper.cli_summarize:main"
|
||||
ocrd-dinglehopper = "dinglehopper.ocrd_cli:ocrd_dinglehopper"
|
||||
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/qurator-spk/dinglehopper"
|
||||
Repository = "https://github.com/qurator-spk/dinglehopper.git"
|
||||
|
||||
|
||||
[tool.setuptools.dynamic]
|
||||
dependencies = {file = ["requirements.txt"]}
|
||||
optional-dependencies.dev = {file = ["requirements-dev.txt"]}
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["src"]
|
||||
|
||||
[tool.setuptools.package-data]
|
||||
dinglehopper = ["templates/*"]
|
||||
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
minversion = 6.0
|
||||
addopts = "--strict-markers"
|
||||
markers = [
|
||||
"integration: integration tests",
|
||||
]
|
||||
|
||||
|
||||
[tool.mypy]
|
||||
ignore_missing_imports = true
|
||||
|
||||
|
||||
[tool.ruff]
|
||||
select = ["E", "F", "I"]
|
||||
ignore = [
|
||||
"F811", # multimethods are considered redefinitions by ruff
|
||||
]
|
|
@ -1,4 +0,0 @@
|
|||
[pytest]
|
||||
markers =
|
||||
integration: integration tests
|
||||
serial
|
|
@ -1 +0,0 @@
|
|||
__import__("pkg_resources").declare_namespace(__name__)
|
|
@ -1,5 +0,0 @@
|
|||
from .ocr_files import *
|
||||
from .extracted_text import *
|
||||
from .character_error_rate import *
|
||||
from .word_error_rate import *
|
||||
from .align import *
|
|
@ -1,15 +0,0 @@
|
|||
function find_diff_class(classes) {
|
||||
return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
|
||||
}
|
||||
|
||||
$(document).ready(function() {
|
||||
/* Enable Bootstrap tooltips */
|
||||
$('[data-toggle="tooltip"]').tooltip();
|
||||
|
||||
$('.diff').mouseover(function() {
|
||||
find_diff_class($(this).attr('class')).addClass('diff-highlight');
|
||||
});
|
||||
$('.diff').mouseout(function() {
|
||||
find_diff_class($(this).attr('class')).removeClass('diff-highlight');
|
||||
});
|
||||
});
|
|
@ -1,5 +1,8 @@
|
|||
pytest
|
||||
pytest-flake8
|
||||
pytest-cov
|
||||
pytest-mypy
|
||||
black
|
||||
pre-commit
|
||||
|
||||
ruff ; python_version >= "3.7"
|
||||
pytest-ruff ; python_version >= "3.7"
|
||||
|
|
|
@ -10,4 +10,4 @@ attrs
|
|||
multimethod >= 1.3
|
||||
tqdm
|
||||
rapidfuzz >= 2.7.0
|
||||
six # XXX workaround OCR-D/core#730
|
||||
chardet
|
||||
|
|
12
setup.cfg
12
setup.cfg
|
@ -1,12 +0,0 @@
|
|||
[flake8]
|
||||
max-line-length = 88
|
||||
extend-ignore = E203, W503
|
||||
|
||||
[pylint]
|
||||
max-line-length = 88
|
||||
|
||||
[pylint.messages_control]
|
||||
disable = C0330, C0326
|
||||
|
||||
[mypy]
|
||||
ignore_missing_imports = True
|
34
setup.py
34
setup.py
|
@ -1,34 +0,0 @@
|
|||
from io import open
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
with open("requirements.txt") as fp:
|
||||
install_requires = fp.read()
|
||||
|
||||
with open('requirements-dev.txt') as fp:
|
||||
tests_require = fp.read()
|
||||
|
||||
setup(
|
||||
name="dinglehopper",
|
||||
author="Mike Gerber, The QURATOR SPK Team",
|
||||
author_email="mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de",
|
||||
description="The OCR evaluation tool",
|
||||
long_description=open("README.md", "r", encoding="utf-8").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
keywords="qurator ocr",
|
||||
license="Apache",
|
||||
namespace_packages=["qurator"],
|
||||
packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
|
||||
install_requires=install_requires,
|
||||
tests_require=tests_require,
|
||||
package_data={
|
||||
"": ["*.json", "templates/*"],
|
||||
},
|
||||
entry_points={
|
||||
"console_scripts": [
|
||||
"dinglehopper=qurator.dinglehopper.cli:main",
|
||||
"dinglehopper-line-dirs=qurator.dinglehopper.cli_line_dirs:main",
|
||||
"dinglehopper-extract=qurator.dinglehopper.cli_extract:main",
|
||||
"ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper",
|
||||
]
|
||||
},
|
||||
)
|
33
src/dinglehopper/__init__.py
Normal file
33
src/dinglehopper/__init__.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
from .align import align, score_hint, seq_align
|
||||
from .character_error_rate import character_error_rate, character_error_rate_n
|
||||
from .edit_distance import distance, editops
|
||||
from .extracted_text import ExtractedText
|
||||
from .ocr_files import (
|
||||
alto_namespace,
|
||||
alto_text,
|
||||
page_namespace,
|
||||
page_text,
|
||||
plain_text,
|
||||
text,
|
||||
)
|
||||
from .word_error_rate import word_error_rate, word_error_rate_n, words
|
||||
|
||||
__all__ = [
|
||||
"editops",
|
||||
"distance",
|
||||
"align",
|
||||
"score_hint",
|
||||
"seq_align",
|
||||
"character_error_rate",
|
||||
"character_error_rate_n",
|
||||
"word_error_rate",
|
||||
"word_error_rate_n",
|
||||
"words",
|
||||
"ExtractedText",
|
||||
"alto_namespace",
|
||||
"alto_text",
|
||||
"page_namespace",
|
||||
"page_text",
|
||||
"plain_text",
|
||||
"text",
|
||||
]
|
|
@ -1,9 +1,12 @@
|
|||
import math
|
||||
import unicodedata
|
||||
from math import ceil
|
||||
|
||||
from .edit_distance import *
|
||||
from rapidfuzz.distance import Levenshtein
|
||||
|
||||
from .edit_distance import grapheme_clusters
|
||||
|
||||
|
||||
def align(t1, t2):
|
||||
"""Align text."""
|
||||
s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
|
|
@ -1,20 +1,22 @@
|
|||
import os
|
||||
from collections import Counter
|
||||
|
||||
import click
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from markupsafe import escape
|
||||
from ocrd_utils import initLogging
|
||||
from math import ceil
|
||||
|
||||
from .character_error_rate import character_error_rate_n
|
||||
from .word_error_rate import word_error_rate_n, words_normalized
|
||||
from .align import seq_align, score_hint
|
||||
from .extracted_text import ExtractedText
|
||||
from .ocr_files import extract
|
||||
from .config import Config
|
||||
from dinglehopper.align import score_hint, seq_align
|
||||
from dinglehopper.character_error_rate import character_error_rate_n
|
||||
from dinglehopper.config import Config
|
||||
from dinglehopper.extracted_text import ExtractedText
|
||||
from dinglehopper.ocr_files import extract
|
||||
from dinglehopper.word_error_rate import word_error_rate_n, words_normalized
|
||||
|
||||
|
||||
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
|
||||
def gen_diff_report(
|
||||
gt_in, ocr_in, css_prefix, joiner, none, *, differences=False, score_hint=None
|
||||
):
|
||||
gtx = ""
|
||||
ocrx = ""
|
||||
|
||||
|
@ -31,16 +33,12 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
|
|||
|
||||
# Set Bootstrap tooltip to the segment id
|
||||
if id_:
|
||||
html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
|
||||
html_custom_attrs += f'data-toggle="tooltip" title="{id_}"'
|
||||
|
||||
if css_classes:
|
||||
return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
|
||||
css_classes=css_classes,
|
||||
html_t=html_t,
|
||||
html_custom_attrs=html_custom_attrs,
|
||||
)
|
||||
return f'<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'
|
||||
else:
|
||||
return "{html_t}".format(html_t=html_t)
|
||||
return f"{html_t}"
|
||||
|
||||
if isinstance(gt_in, ExtractedText):
|
||||
if not isinstance(ocr_in, ExtractedText):
|
||||
|
@ -53,6 +51,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
|
|||
|
||||
g_pos = 0
|
||||
o_pos = 0
|
||||
found_differences = []
|
||||
|
||||
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, score_hint)):
|
||||
css_classes = None
|
||||
gt_id = None
|
||||
|
@ -65,6 +65,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
|
|||
# Deletions and inserts only produce one id + None, UI must
|
||||
# support this, i.e. display for the one id produced
|
||||
|
||||
if differences:
|
||||
found_differences.append(f"{g} :: {o}")
|
||||
|
||||
gtx += joiner + format_thing(g, css_classes, gt_id)
|
||||
ocrx += joiner + format_thing(o, css_classes, ocr_id)
|
||||
|
||||
|
@ -73,13 +76,18 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
|
|||
if o is not None:
|
||||
o_pos += len(o)
|
||||
|
||||
return """
|
||||
found_differences = dict(Counter(elem for elem in found_differences))
|
||||
|
||||
return (
|
||||
"""
|
||||
<div class="row">
|
||||
<div class="col-md-6 gt">{}</div>
|
||||
<div class="col-md-6 ocr">{}</div>
|
||||
</div>
|
||||
""".format(
|
||||
gtx, ocrx
|
||||
gtx, ocrx
|
||||
),
|
||||
found_differences,
|
||||
)
|
||||
|
||||
|
||||
|
@ -96,11 +104,20 @@ def json_float(value):
|
|||
return str(value)
|
||||
|
||||
|
||||
def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
||||
def process(
|
||||
gt,
|
||||
ocr,
|
||||
report_prefix,
|
||||
reports_folder=".",
|
||||
*,
|
||||
metrics=True,
|
||||
differences=False,
|
||||
textequiv_level="region",
|
||||
):
|
||||
"""Check OCR result against GT.
|
||||
|
||||
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
|
||||
Click on a wrapper.
|
||||
The @click decorators change the signature of the decorated functions, so we keep
|
||||
this undecorated version and use Click on a wrapper.
|
||||
"""
|
||||
|
||||
gt_text = extract(gt, textequiv_level=textequiv_level)
|
||||
|
@ -109,15 +126,25 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
|||
ocr_words = words_normalized(ocr_text)
|
||||
|
||||
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
|
||||
char_diff_report = gen_diff_report(
|
||||
gt_text, ocr_text, css_prefix="c", joiner="", none="·",
|
||||
score_hint=score_hint(cer, n_characters)
|
||||
char_diff_report, diff_c = gen_diff_report(
|
||||
gt_text,
|
||||
ocr_text,
|
||||
css_prefix="c",
|
||||
joiner="",
|
||||
none="·",
|
||||
score_hint=score_hint(cer, n_characters),
|
||||
differences=differences,
|
||||
)
|
||||
|
||||
wer, n_words = word_error_rate_n(gt_words, ocr_words)
|
||||
word_diff_report = gen_diff_report(
|
||||
gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯",
|
||||
score_hint=score_hint(wer, n_words)
|
||||
word_diff_report, diff_w = gen_diff_report(
|
||||
gt_words,
|
||||
ocr_words,
|
||||
css_prefix="w",
|
||||
joiner=" ",
|
||||
none="⋯",
|
||||
score_hint=score_hint(wer, n_words),
|
||||
differences=differences,
|
||||
)
|
||||
|
||||
env = Environment(
|
||||
|
@ -129,7 +156,11 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
|||
|
||||
for report_suffix in (".html", ".json"):
|
||||
template_fn = "report" + report_suffix + ".j2"
|
||||
out_fn = report_prefix + report_suffix
|
||||
|
||||
if not os.path.isdir(reports_folder):
|
||||
os.mkdir(reports_folder)
|
||||
|
||||
out_fn = os.path.join(reports_folder, report_prefix + report_suffix)
|
||||
|
||||
template = env.get_template(template_fn)
|
||||
template.stream(
|
||||
|
@ -142,16 +173,46 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
|||
char_diff_report=char_diff_report,
|
||||
word_diff_report=word_diff_report,
|
||||
metrics=metrics,
|
||||
differences=differences,
|
||||
diff_c=diff_c,
|
||||
diff_w=diff_w,
|
||||
).dump(out_fn)
|
||||
|
||||
|
||||
def process_dir(
    gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level
):
    """Compare every file in the GT directory with its same-named OCR file.

    For each entry of ``gt`` that is a regular file and has a regular file of
    the same name in ``ocr``, ``process()`` is called with the report prefix
    ``"<file name>-<report_prefix>"``. Entries without such a counterpart
    (this includes subdirectories) are reported on stdout and skipped.

    Args:
        gt: Directory containing the ground truth files.
        ocr: Directory containing the OCR result files.
        report_prefix: Appended to each file name (separated by ``-``) to
            form the per-file report prefix.
        reports_folder: Passed through to ``process()``.
        metrics: Passed through to ``process()``.
        differences: Passed through to ``process()``.
        textequiv_level: Passed through to ``process()``.
    """
    # os.listdir() returns entries in arbitrary order; sort them so that the
    # processing order and the console output are reproducible between runs.
    for gt_file in sorted(os.listdir(gt)):
        gt_file_path = os.path.join(gt, gt_file)
        # The OCR file must carry exactly the same name as the GT file.
        ocr_file_path = os.path.join(ocr, gt_file)

        if os.path.isfile(gt_file_path) and os.path.isfile(ocr_file_path):
            process(
                gt_file_path,
                ocr_file_path,
                f"{gt_file}-{report_prefix}",
                reports_folder=reports_folder,
                metrics=metrics,
                differences=differences,
                textequiv_level=textequiv_level,
            )
        else:
            print(f"Skipping {gt_file_path} and {ocr_file_path}")
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument("gt", type=click.Path(exists=True))
|
||||
@click.argument("ocr", type=click.Path(exists=True))
|
||||
@click.argument("report_prefix", type=click.Path(), default="report")
|
||||
@click.argument("reports_folder", type=click.Path(), default=".")
|
||||
@click.option(
|
||||
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
|
||||
)
|
||||
@click.option(
|
||||
"--differences",
|
||||
default=False,
|
||||
help="Enable reporting character and word level differences",
|
||||
)
|
||||
@click.option(
|
||||
"--textequiv-level",
|
||||
default="region",
|
||||
|
@ -159,7 +220,16 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
|||
metavar="LEVEL",
|
||||
)
|
||||
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
|
||||
def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
|
||||
def main(
|
||||
gt,
|
||||
ocr,
|
||||
report_prefix,
|
||||
reports_folder,
|
||||
metrics,
|
||||
differences,
|
||||
textequiv_level,
|
||||
progress,
|
||||
):
|
||||
"""
|
||||
Compare the PAGE/ALTO/text document GT against the document OCR.
|
||||
|
||||
|
@ -171,7 +241,8 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
|
|||
that case, use --no-metrics to disable the then meaningless metrics and also
|
||||
change the color scheme from green/red to blue.
|
||||
|
||||
The comparison report will be written to $REPORT_PREFIX.{html,json}, where
|
||||
The comparison report will be written to $REPORTS_FOLDER/$REPORT_PREFIX.{html,json},
|
||||
where $REPORTS_FOLDER defaults to the current working directory and
|
||||
$REPORT_PREFIX defaults to "report". The reports include the character error
|
||||
rate (CER) and the word error rate (WER).
|
||||
|
||||
|
@ -180,7 +251,31 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
|
|||
"""
|
||||
initLogging()
|
||||
Config.progress = progress
|
||||
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
|
||||
if os.path.isdir(gt):
|
||||
if not os.path.isdir(ocr):
|
||||
raise click.BadParameter(
|
||||
"OCR must be a directory if GT is a directory", param_hint="ocr"
|
||||
)
|
||||
else:
|
||||
process_dir(
|
||||
gt,
|
||||
ocr,
|
||||
report_prefix,
|
||||
reports_folder,
|
||||
metrics,
|
||||
differences,
|
||||
textequiv_level,
|
||||
)
|
||||
else:
|
||||
process(
|
||||
gt,
|
||||
ocr,
|
||||
report_prefix,
|
||||
reports_folder,
|
||||
metrics=metrics,
|
||||
differences=differences,
|
||||
textequiv_level=textequiv_level,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
|
@ -1,15 +1,15 @@
|
|||
import os
|
||||
import itertools
|
||||
import os
|
||||
|
||||
import click
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from ocrd_utils import initLogging
|
||||
from math import ceil
|
||||
|
||||
from .align import score_hint
|
||||
from .character_error_rate import character_error_rate_n
|
||||
from .word_error_rate import word_error_rate_n, words_normalized
|
||||
from .ocr_files import plain_extract
|
||||
from .cli import gen_diff_report, json_float
|
||||
from .ocr_files import plain_extract
|
||||
from .word_error_rate import word_error_rate_n, words_normalized
|
||||
|
||||
|
||||
def all_equal(iterable):
|
||||
|
@ -75,12 +75,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
|||
|
||||
# Generate diff reports
|
||||
char_diff_report += gen_diff_report(
|
||||
gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·",
|
||||
score_hint=score_hint(l_cer, l_n_characters)
|
||||
gt_text,
|
||||
ocr_text,
|
||||
css_prefix="l{0}-c".format(k),
|
||||
joiner="",
|
||||
none="·",
|
||||
score_hint=score_hint(l_cer, l_n_characters),
|
||||
)
|
||||
word_diff_report += gen_diff_report(
|
||||
gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯",
|
||||
score_hint=score_hint(l_wer, l_n_words)
|
||||
gt_words,
|
||||
ocr_words,
|
||||
css_prefix="l{0}-w".format(k),
|
||||
joiner=" ",
|
||||
none="⋯",
|
||||
score_hint=score_hint(l_wer, l_n_words),
|
||||
)
|
||||
|
||||
env = Environment(
|
106
src/dinglehopper/cli_summarize.py
Normal file
106
src/dinglehopper/cli_summarize.py
Normal file
|
@ -0,0 +1,106 @@
|
|||
import json
|
||||
import os
|
||||
|
||||
import click
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from ocrd_utils import initLogging
|
||||
|
||||
from dinglehopper.cli import json_float
|
||||
|
||||
|
||||
def process(reports_folder, occurrences_threshold=1):
    """Summarize all dinglehopper JSON reports found in ``reports_folder``.

    Every ``*.json`` file in the folder that contains both a ``"cer"`` and a
    ``"wer"`` entry counts as a report. The CER and WER values are averaged and
    the per-report ``"differences"`` counts (if present) are added up. The
    result is rendered through the ``summary.html.j2`` and ``summary.json.j2``
    templates into ``summary.html`` and ``summary.json`` inside
    ``reports_folder``.

    :param reports_folder: Directory holding the JSON reports; the summary
        files are written into the same directory.
    :param occurrences_threshold: Handed to the templates unchanged; the HTML
        template uses it to omit differences with fewer occurrences.
    :return: ``None``. If no usable report is found, a message is printed and
        no summary file is written.
    """
    num_reports = 0
    cer_sum = 0
    wer_sum = 0
    diff_c = {}
    diff_w = {}

    # Sorted so that the order of the accumulated differences (and thus the
    # generated summary) does not depend on the file system's listing order.
    for report in sorted(os.listdir(reports_folder)):
        if not report.endswith(".json"):
            continue
        # summary.json is this function's own output from an earlier run.
        if report == "summary.json":
            continue

        with open(os.path.join(reports_folder, report), "r", encoding="utf-8") as f:
            report_data = json.load(f)

        if "cer" not in report_data or "wer" not in report_data:
            click.echo(
                f"Skipping {report} because it does not contain CER and WER"
            )
            continue

        num_reports += 1
        cer_sum += report_data["cer"]
        wer_sum += report_data["wer"]

        # Differences are optional; a report may lack the whole section or
        # just one of the two levels.
        differences = report_data.get("differences", {})
        for totals, level in ((diff_c, "character_level"), (diff_w, "word_level")):
            for key, value in differences.get(level, {}).items():
                totals[key] = totals.get(key, 0) + value

    if num_reports == 0:
        click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
        return

    cer_avg = cer_sum / num_reports
    wer_avg = wer_sum / num_reports

    click.echo(f"Number of reports: {num_reports}")
    click.echo(f"Average CER: {cer_avg}")
    click.echo(f"Average WER: {wer_avg}")
    click.echo(f"Sum of CER: {cer_sum}")
    click.echo(f"Sum of WER: {wer_sum}")

    env = Environment(
        loader=FileSystemLoader(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
        )
    )
    env.filters["json_float"] = json_float
    for report_suffix in (".html", ".json"):
        template_fn = "summary" + report_suffix + ".j2"
        out_fn = os.path.join(reports_folder, "summary" + report_suffix)

        template = env.get_template(template_fn)
        template.stream(
            num_reports=num_reports,
            cer_avg=cer_avg,
            wer_avg=wer_avg,
            diff_c=diff_c,
            diff_w=diff_w,
            occurrences_threshold=occurrences_threshold,
        ).dump(out_fn)
|
||||
|
||||
|
||||
@click.command()
@click.argument(
    "reports_folder",
    # file_okay=False: the argument is listed with os.listdir(), so a regular
    # file must be rejected by click instead of crashing later.
    type=click.Path(exists=True, file_okay=False),
    default="./reports",
)
@click.option(
    "--occurrences-threshold",
    type=int,
    default=1,
    help="Only show differences that occur at least this many times.",
)
def main(reports_folder, occurrences_threshold):
    """
    Summarize the results from multiple reports generated earlier by dinglehopper.
    It calculates the average CER and WER, as well as a sum of common mistakes.
    Reports include lists of mistakes and their occurrences.

    You may use a threshold to reduce the file size of the HTML report by only showing
    mistakes that occur at least as often as the threshold. The JSON report will
    always contain all mistakes.

    All JSON files in the provided folder will be gathered and summarized.
    """
    initLogging()
    process(reports_folder, occurrences_threshold)


if __name__ == "__main__":
    main()
|
|
@ -1,8 +1,8 @@
|
|||
import unicodedata
|
||||
|
||||
from multimethod import multimethod
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
from rapidfuzz.distance import Levenshtein
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from .extracted_text import ExtractedText
|
||||
|
|
@ -2,6 +2,7 @@ import os
|
|||
import sys
|
||||
from typing import Iterator
|
||||
|
||||
import chardet
|
||||
from lxml import etree as ET
|
||||
from lxml.etree import XMLSyntaxError
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
@ -12,8 +13,8 @@ from .extracted_text import ExtractedText, normalize_sbb
|
|||
def alto_namespace(tree: ET.ElementTree) -> str:
|
||||
"""Return the ALTO namespace used in the given ElementTree.
|
||||
|
||||
This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not
|
||||
check if the files uses any valid ALTO namespace.
|
||||
This relies on the assumption that, in any given ALTO file, the root element has the
|
||||
local name "alto". We do not check if the file uses any valid ALTO namespace.
|
||||
"""
|
||||
root_name = ET.QName(tree.getroot().tag)
|
||||
if root_name.localname == "alto":
|
||||
|
@ -48,8 +49,9 @@ def alto_text(tree):
|
|||
def page_namespace(tree):
|
||||
"""Return the PAGE content namespace used in the given ElementTree.
|
||||
|
||||
This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We
|
||||
do not check if the files uses any valid PAGE namespace.
|
||||
This relies on the assumption that, in any given PAGE content file, the root element
|
||||
has the local name "PcGts". We do not check if the file uses any valid PAGE
|
||||
namespace.
|
||||
"""
|
||||
root_name = ET.QName(tree.getroot().tag)
|
||||
if root_name.localname == "PcGts":
|
||||
|
@ -135,6 +137,10 @@ def page_text(tree, *, textequiv_level="region"):
|
|||
return page_extract(tree, textequiv_level=textequiv_level).text
|
||||
|
||||
|
||||
def detect_encoding(filename):
    """Guess the text encoding of ``filename`` from its first 1024 bytes.

    Returns the ``"encoding"`` entry of chardet's detection result.
    """
    # NOTE(review): only a 1024-byte prefix is inspected; presumably a file
    # whose first non-ASCII character comes later can be misdetected - confirm.
    with open(filename, "rb") as f:
        return chardet.detect(f.read(1024))["encoding"]
|
||||
|
||||
|
||||
def plain_extract(filename, include_filename_in_id=False):
|
||||
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
|
||||
|
||||
|
@ -149,7 +155,8 @@ def plain_extract(filename, include_filename_in_id=False):
|
|||
clusters,
|
||||
)
|
||||
|
||||
with open(filename, "r") as f:
|
||||
fileencoding = detect_encoding(filename)
|
||||
with open(filename, "r", encoding=fileencoding) as f:
|
||||
return ExtractedText(
|
||||
None,
|
||||
[make_segment(no, line) for no, line in enumerate(f.readlines())],
|
||||
|
@ -171,7 +178,7 @@ def extract(filename, *, textequiv_level="region"):
|
|||
"""
|
||||
try:
|
||||
tree = ET.parse(filename)
|
||||
except XMLSyntaxError:
|
||||
except (XMLSyntaxError, UnicodeDecodeError):
|
||||
return plain_extract(filename)
|
||||
try:
|
||||
return page_extract(tree, textequiv_level=textequiv_level)
|
|
@ -1,4 +1,5 @@
|
|||
{
|
||||
"version": "0.9.4",
|
||||
"git_url": "https://github.com/qurator-spk/dinglehopper",
|
||||
"tools": {
|
||||
"ocrd-dinglehopper": {
|
|
@ -4,7 +4,7 @@ import os
|
|||
import click
|
||||
from ocrd import Processor
|
||||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
||||
from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
|
||||
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
|
||||
from pkg_resources import resource_string
|
||||
|
||||
from .cli import process as cli_process
|
|
@ -26,6 +26,22 @@
|
|||
border: 2px solid;
|
||||
border-radius: 5px;
|
||||
}
|
||||
|
||||
.row {
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
table {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
th {
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
th:hover {
|
||||
background-color: #eee;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
@ -50,6 +66,32 @@
|
|||
<h2>Word differences</h2>
|
||||
{{ word_diff_report }}
|
||||
|
||||
{#- Sortable tables of the differences found at character and word level. #}
{%- if differences %}
{% set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}

<div class="row">
{% for section in sections %}
    <div class="col-md-6">
        <h2>{{ section['title'] }}</h2>
        <table>
            <thead>
                <tr>
                    <th>GT</th>
                    <th>OCR</th>
                    <th>Occurrences</th>
                </tr>
            </thead>
            {#- Explicit tbody: the column sorting script selects 'tbody > tr'. #}
            <tbody>
            {% for gt_ocr, occurrences in section['data'].items() %}
                {#- Keys have the form "<GT> :: <OCR>". #}
                {%- set parts = gt_ocr.split(" :: ") %}
                <tr>
                    {#- Escaped explicitly; the text comes from the compared documents. #}
                    <td>{{ parts[0]|e }}</td>
                    <td>{{ parts[1]|e }}</td>
                    <td>{{ occurrences }}</td>
                </tr>
            {% endfor %}
            </tbody>
        </table>
    </div>
{% endfor %}
</div>
{%- endif %}
|
||||
|
||||
</div>
|
||||
|
39
src/dinglehopper/templates/report.html.js
Normal file
39
src/dinglehopper/templates/report.html.js
Normal file
|
@ -0,0 +1,39 @@
|
|||
/**
 * Select every element that carries the same numbered diff class as the
 * given element.
 *
 * @param {string} classes - whitespace-separated value of a class attribute
 * @returns the jQuery set for the first class name matching /.diff\d.*​/
 */
function find_diff_class(classes) {
  const classNames = classes.split(/\s+/);
  const diffClass = classNames.find(function (name) {
    return name.match(/.diff\d.*/);
  });
  return $('.' + diffClass);
}
|
||||
|
||||
$(document).ready(function () {
  /* Enable Bootstrap tooltips */
  $('[data-toggle="tooltip"]').tooltip();

  /* Highlight all elements sharing the hovered element's numbered diff class */
  $('.diff')
    .mouseover(function () {
      find_diff_class($(this).attr('class')).addClass('diff-highlight');
    })
    .mouseout(function () {
      find_diff_class($(this).attr('class')).removeClass('diff-highlight');
    });

  /* Lower-cased text of the cell at position `index` of `row` */
  function cellText(row, index) {
    return $(row).children('td').eq(index).text().toLowerCase();
  }

  /* Build a comparator ordering two rows by the text of column `index`;
     digit runs compare numerically, case and accents are ignored */
  function compareRows(index) {
    return function (row1, row2) {
      const cell1 = cellText(row1, index);
      const cell2 = cellText(row2, index);
      return cell1.localeCompare(cell2, undefined, {
        numeric: true,
        sensitivity: 'base'
      });
    };
  }

  /* Sort this column of the table; each click flips the direction */
  $('th').click(function () {
    const table = $(this).closest('table');
    const rows = table.find('tbody > tr').toArray().sort(compareRows($(this).index()));
    /* The direction is remembered on the header element itself */
    this.asc = !this.asc;
    if (!this.asc) {
      rows.reverse();
    }
    /* Appending an attached row moves it, so this re-orders the body */
    rows.forEach(function (row) {
      table.children('tbody').append(row);
    });
  });
});
|
|
@ -4,6 +4,12 @@
|
|||
{% if metrics %}
|
||||
"cer": {{ cer|json_float }},
|
||||
"wer": {{ wer|json_float }},
|
||||
{% endif %}
|
||||
{% if differences %}
|
||||
"differences": {
|
||||
"character_level": {{ diff_c|tojson }},
|
||||
"word_level": {{ diff_w|tojson }}
|
||||
},
|
||||
{% endif %}
|
||||
"n_characters": {{ n_characters }},
|
||||
"n_words": {{ n_words }}
|
136
src/dinglehopper/templates/summary.html.j2
Normal file
136
src/dinglehopper/templates/summary.html.j2
Normal file
|
@ -0,0 +1,136 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
|
||||
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
|
||||
<style type="text/css">
|
||||
{% if metrics %}
|
||||
.gt .diff {
|
||||
color: green;
|
||||
}
|
||||
.ocr .diff {
|
||||
color: red;
|
||||
}
|
||||
{% else %}
|
||||
.gt .diff, .ocr .diff {
|
||||
color: blue;
|
||||
}
|
||||
{% endif %}
|
||||
.ellipsis {
|
||||
opacity: 0.5;
|
||||
font-style: italic;
|
||||
}
|
||||
.diff-highlight {
|
||||
border: 2px solid;
|
||||
border-radius: 5px;
|
||||
}
|
||||
|
||||
.row {
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
table {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.cer {
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
tr:hover {
|
||||
background-color: #f5f5f5;
|
||||
}
|
||||
|
||||
th {
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
th:hover {
|
||||
background-color: #eee;
|
||||
}
|
||||
|
||||
td {
|
||||
min-width: 100px;
|
||||
}
|
||||
|
||||
td:hover {
|
||||
background-color: #eee;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<div class="container">
|
||||
|
||||
<div class="row">
|
||||
<h1>Summary of all reports</h1>
|
||||
</div>
|
||||
|
||||
<div class="row">
|
||||
<p>Number of reports: {{ num_reports }}</p>
|
||||
</div>
|
||||
|
||||
{# Tested with `is number` rather than truthiness so that a perfect average of 0 is still shown. -#}
{% if cer_avg is number and wer_avg is number -%}
    <div class="row">
        <h2>Metrics</h2>
    </div>

    <div class="row cer">
        <p>Average CER: {{ cer_avg|round(4) }}</p>
        <p>Average WER: {{ wer_avg|round(4) }}</p>
    </div>
{% endif %}
|
||||
|
||||
{#- Tables of the accumulated GT/OCR differences; rendered as soon as either level has entries. #}
{%- if diff_c or diff_w %}
    {%- set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}

    <div class="row">
    {%- for section in sections %}
        <div class="col-md-6">
            <h2>{{ section['title'] }}</h2>
            <table>
                <thead>
                    <tr><th>GT</th><th>OCR</th><th>Occurrences</th></tr>
                </thead>
                {#- Explicit tbody: the column sorting script selects 'tbody > tr'. #}
                <tbody>
                {#- A namespace object is used so the counter can be updated from inside the loop. #}
                {%- set num_omitted = namespace(value=0) -%}
                {% for gt_ocr, occurrences in section['data'].items() -%}
                    {% if occurrences < occurrences_threshold -%}
                        {%- set num_omitted.value = num_omitted.value + 1 %}
                    {%- else -%}
                        {#- Keys have the form "<GT> :: <OCR>". #}
                        {%- set gt = gt_ocr.split(" :: ")[0] %}
                        {%- set ocr = gt_ocr.split(" :: ")[1] %}
                        <tr>
                            {#- The title holds the URL-encoded form, the cell shows the character itself (escaped). #}
                            <td title="{{ gt|urlencode }}">{{ gt|e }}</td>
                            <td title="{{ ocr|urlencode }}">{{ ocr|e }}</td>
                            <td>{{ occurrences }}</td>
                        </tr>
                    {%- endif %}
                {%- endfor %}
                </tbody>
            </table>
            {#- Kept outside the table: a paragraph is not valid table content. #}
            {% if num_omitted.value > 0 and occurrences_threshold > 1 -%}
                <p>Skipped {{ num_omitted.value }} diffs with fewer than {{ occurrences_threshold }} occurrences. The complete list of diffs is available in the accompanying JSON file.</p>
            {%- endif %}
        </div>
    {%- endfor %}
    </div>
{%- endif %}
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
|
||||
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
|
||||
|
||||
<script>
|
||||
{% include 'report.html.js' %}
|
||||
</script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
15
src/dinglehopper/templates/summary.json.j2
Normal file
15
src/dinglehopper/templates/summary.json.j2
Normal file
|
@ -0,0 +1,15 @@
|
|||
{#- Summary of all reports. Each optional section emits its own leading comma,
    and averages are tested with `is number` so that a value of 0 is kept. -#}
{
    "num_reports": {{ num_reports}}
    {%- if cer_avg is number and wer_avg is number %}
    ,
    "cer_avg": {{ cer_avg|json_float }},
    "wer_avg": {{ wer_avg|json_float }}
    {%- endif %}
    {%- if diff_c or diff_w %}
    ,
    "differences": {
        "character_level": {{ diff_c|tojson }},
        "word_level": {{ diff_w|tojson }}
    }
    {%- endif %}
}
|
File diff suppressed because it is too large
Load diff
22865
src/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
Normal file
22865
src/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/directory-test/ocr/2.xml
Normal file
3394
src/dinglehopper/tests/data/directory-test/ocr/2.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
Normal file
3394
src/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/test-fake-ocr.page2018.xml
Normal file
3394
src/dinglehopper/tests/data/test-fake-ocr.page2018.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/test-gt.page2018.xml
Normal file
3394
src/dinglehopper/tests/data/test-gt.page2018.xml
Normal file
File diff suppressed because it is too large
Load diff
|
@ -20183,4 +20183,4 @@
|
|||
</PrintSpace>
|
||||
</Page>
|
||||
</Layout>
|
||||
</alto>
|
||||
</alto>
|
|
@ -61,4 +61,4 @@
|
|||
</PrintSpace>
|
||||
</Page>
|
||||
</Layout>
|
||||
</alto>
|
||||
</alto>
|
3394
src/dinglehopper/tests/data/test.page2018.xml
Normal file
3394
src/dinglehopper/tests/data/test.page2018.xml
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1 +1 @@
|
|||
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
|
||||
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
|
Before Width: | Height: | Size: 426 KiB After Width: | Height: | Size: 426 KiB |
|
@ -6,7 +6,7 @@ import pytest
|
|||
from lxml import etree as ET
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from .. import seq_align, ExtractedText
|
||||
from .. import ExtractedText, seq_align
|
||||
|
||||
|
||||
def test_text():
|
||||
|
@ -30,12 +30,20 @@ def test_text():
|
|||
|
||||
def test_normalization_check():
|
||||
with pytest.raises(ValueError, match=r".*is not in NFC.*"):
|
||||
ExtractedText("foo", None, None,
|
||||
unicodedata.normalize("NFD", "Schlyñ"),
|
||||
grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")))
|
||||
assert ExtractedText("foo", None, None,
|
||||
unicodedata.normalize("NFC", "Schlyñ"),
|
||||
grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")))
|
||||
ExtractedText(
|
||||
"foo",
|
||||
None,
|
||||
None,
|
||||
unicodedata.normalize("NFD", "Schlyñ"),
|
||||
grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")),
|
||||
)
|
||||
assert ExtractedText(
|
||||
"foo",
|
||||
None,
|
||||
None,
|
||||
unicodedata.normalize("NFC", "Schlyñ"),
|
||||
grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")),
|
||||
)
|
||||
|
||||
|
||||
AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
|
|
@ -1,7 +1,9 @@
|
|||
import math
|
||||
|
||||
import pytest
|
||||
|
||||
from .. import align, distance, score_hint, seq_align
|
||||
from .util import unzip
|
||||
from .. import align, seq_align, distance, score_hint
|
||||
|
||||
|
||||
def test_left_empty():
|
||||
|
@ -72,7 +74,8 @@ def test_with_some_fake_ocr_errors():
|
|||
result = list(
|
||||
align(
|
||||
"Über die vielen Sorgen wegen desselben vergaß",
|
||||
"SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
|
||||
"SomeJunk MoreJunk "
|
||||
+ "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
|
||||
)
|
||||
)
|
||||
left, right = unzip(result)
|
||||
|
@ -183,6 +186,7 @@ def test_lines_similar():
|
|||
# Test __eq__ (i.e. is it a substitution or a similar string?)
|
||||
assert list(left)[0] == list(right)[0]
|
||||
|
||||
|
||||
def test_score_hint():
|
||||
assert score_hint(0.5, 23) == 12 # int(ceil())
|
||||
assert score_hint(math.inf, 12345) is None
|
|
@ -36,6 +36,7 @@ def test_character_error_rate_hard():
|
|||
len(s2) == 7
|
||||
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
|
||||
|
||||
# Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
|
||||
# Both strings have the same length in terms of grapheme clusters. So the CER should
|
||||
# be symmetrical.
|
||||
assert character_error_rate(s2, s1) == 1 / 6
|
||||
assert character_error_rate(s1, s2) == 1 / 6
|
|
@ -15,7 +15,9 @@ def test_align_page_files():
|
|||
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
|
||||
# → 2 elements in the alignment should be different, the ligature is
|
||||
# (currently) not counted due to normalization.
|
||||
# NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
|
||||
#
|
||||
# NOTE: In this example, it doesn't matter that we work with "characters", not
|
||||
# grapheme clusters.
|
||||
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
|
||||
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
|
28
src/dinglehopper/tests/test_integ_bigger_texts.py
Normal file
28
src/dinglehopper/tests/test_integ_bigger_texts.py
Normal file
|
@ -0,0 +1,28 @@
|
|||
from __future__ import division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import alto_text, character_error_rate, page_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_bigger_texts():
    """Compute the CER for a large PAGE ground truth and its ALTO OCR result."""
    text_dir = os.path.join(data_dir, "bigger-texts", "00008228")
    gt_tree = ET.parse(os.path.join(text_dir, "00008228.gt.xml"))
    ocr_tree = ET.parse(os.path.join(text_dir, "00008228-00236534.gt4hist.xml"))

    gt = page_text(gt_tree)
    ocr = alto_text(ocr_tree)

    # Only interested in a result here: In earlier versions this would have used
    # tens of GB of RAM and should now not break a sweat.
    cer = character_error_rate(gt, ocr)
    assert cer >= 0.0
|
|
@ -6,7 +6,7 @@ import pytest
|
|||
from lxml import etree as ET
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from .. import character_error_rate, page_text, alto_text
|
||||
from .. import alto_text, character_error_rate, page_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
53
src/dinglehopper/tests/test_integ_cli_dir.py
Normal file
53
src/dinglehopper/tests/test_integ_cli_dir.py
Normal file
|
@ -0,0 +1,53 @@
|
|||
import os
|
||||
|
||||
import pytest
|
||||
from ocrd_utils import initLogging
|
||||
|
||||
from dinglehopper.cli import process_dir
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_directory(tmp_path):
    """
    Test that the cli/process_dir() processes a directory of files and
    yields JSON and HTML reports.
    """
    gt_dir = os.path.join(data_dir, "directory-test", "gt")
    ocr_dir = os.path.join(data_dir, "directory-test", "ocr")
    reports_dir = tmp_path / "reports"

    initLogging()
    process_dir(gt_dir, ocr_dir, "report", str(reports_dir), False, True, "line")

    # One JSON and one HTML report per document pair
    expected_reports = (
        "1.xml-report.json",
        "1.xml-report.html",
        "2.xml-report.json",
        "2.xml-report.html",
    )
    for report_name in expected_reports:
        assert os.path.exists(reports_dir / report_name)
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_fail_without_gt(tmp_path):
    """
    Test that the cli/process_dir skips a file if there is no corresponding file
    in the other directory.
    """
    gt_dir = os.path.join(data_dir, "directory-test", "gt")
    ocr_dir = os.path.join(data_dir, "directory-test", "ocr")
    reports_dir = tmp_path / "reports"

    initLogging()
    process_dir(gt_dir, ocr_dir, "report", str(reports_dir), False, True, "line")

    # Two documents with two report files each; nothing for the unmatched file
    generated = os.listdir(reports_dir)
    assert len(generated) == 2 * 2
|
|
@ -1,9 +1,9 @@
|
|||
import json
|
||||
|
||||
import pytest
|
||||
from .util import working_directory
|
||||
|
||||
from ..cli import process
|
||||
from .util import working_directory
|
||||
|
||||
|
||||
@pytest.mark.integration
|
37
src/dinglehopper/tests/test_integ_differences.py
Normal file
37
src/dinglehopper/tests/test_integ_differences.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
import json
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from ocrd_utils import initLogging
|
||||
|
||||
from dinglehopper.cli import process
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_differences(tmp_path):
    """Test that the cli/process() yields a JSON report that includes
    the differences found between the GT and OCR text"""
    gt_file = os.path.join(data_dir, "test-gt.page2018.xml")
    ocr_file = os.path.join(data_dir, "test-fake-ocr.page2018.xml")
    report_json = tmp_path / "report.json"

    initLogging()
    process(gt_file, ocr_file, "report", tmp_path, differences=True)

    assert os.path.exists(report_json)

    with open(report_json, "r") as report_file:
        report = json.load(report_file)

    expected_differences = {
        "character_level": {"n :: m": 1, "ſ :: f": 1},
        "word_level": {
            "Augenblick :: Augemblick": 1,
            "Verſprochene :: Verfprochene": 1,
        },
    }
    assert report["differences"] == expected_differences
|
|
@ -5,7 +5,7 @@ import os
|
|||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import distance, page_text, alto_text
|
||||
from .. import alto_text, distance, page_text
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
|
@ -1,21 +1,20 @@
|
|||
import json
|
||||
import os
|
||||
import shutil
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from click.testing import CliRunner
|
||||
from .util import working_directory
|
||||
|
||||
|
||||
from ..ocrd_cli import ocrd_dinglehopper
|
||||
from .util import working_directory
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
@pytest.mark.skipif(sys.platform == 'win32', reason="only on unix")
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="only on unix")
|
||||
def test_ocrd_cli(tmp_path):
|
||||
"""Test OCR-D interface"""
|
||||
|
110
src/dinglehopper/tests/test_integ_summarize.py
Normal file
110
src/dinglehopper/tests/test_integ_summarize.py
Normal file
|
@ -0,0 +1,110 @@
|
|||
import json
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from .. import cli_summarize
|
||||
from .util import working_directory
|
||||
|
||||
expected_cer_avg = (0.05 + 0.10) / 2
|
||||
expected_wer_avg = (0.15 + 0.20) / 2
|
||||
expected_diff_c = {"a": 30, "b": 50}
|
||||
expected_diff_w = {"c": 70, "d": 90}
|
||||
|
||||
|
||||
@pytest.fixture
def create_summaries(tmp_path):
    """Write two mock reports into a fresh folder and return the folder's path"""
    reports_dirname = tmp_path / "reports"
    reports_dirname.mkdir()

    mock_reports = {
        "report1.json": {
            "cer": 0.05,
            "wer": 0.15,
            "differences": {
                "character_level": {"a": 10, "b": 20},
                "word_level": {"c": 30, "d": 40},
            },
        },
        "report2.json": {
            "cer": 0.10,
            "wer": 0.20,
            "differences": {
                "character_level": {"a": 20, "b": 30},
                "word_level": {"c": 40, "d": 50},
            },
        },
    }

    for filename, report in mock_reports.items():
        with open(os.path.join(reports_dirname, filename), "w") as f:
            json.dump(report, f)

    return str(reports_dirname)
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_summarize_json(tmp_path, create_summaries):
    """Test that the cli/process() yields a summarized JSON report"""
    with working_directory(tmp_path):
        reports_dirname = create_summaries
        cli_summarize.process(reports_dirname)

        summary_file = os.path.join(reports_dirname, "summary.json")
        with open(summary_file, "r") as f:
            summary_data = json.load(f)

        differences = summary_data["differences"]

        assert summary_data["num_reports"] == 2
        assert summary_data["cer_avg"] == expected_cer_avg
        assert summary_data["wer_avg"] == expected_wer_avg
        assert differences["character_level"] == expected_diff_c
        assert differences["word_level"] == expected_diff_w
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_summarize_html(tmp_path, create_summaries):
    """Test that the cli/process() yields an HTML report"""
    with working_directory(tmp_path):
        reports_dirname = create_summaries
        cli_summarize.process(reports_dirname)

        html_file = os.path.join(reports_dirname, "summary.html")
        assert os.path.isfile(html_file)

        with open(html_file, "r") as f:
            contents = f.read()

        assert len(contents) > 0

        # The averages are shown rounded to four decimal places
        expected_lines = (
            "Number of reports: 2",
            f"Average CER: {round(expected_cer_avg, 4)}",
            f"Average WER: {round(expected_wer_avg, 4)}",
        )
        for expected_line in expected_lines:
            assert expected_line in contents
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries):
    """
    Test that the cli/process() does not include reports that are missing a WER value.
    """
    # This third report has no WER value and should not be included in the summary
    report_without_wer = {
        "cer": 0.10,
        "differences": {
            "character_level": {"a": 20, "b": 30},
            "word_level": {"c": 40, "d": 50},
        },
    }

    with working_directory(tmp_path):
        reports_dirname = create_summaries

        invalid_file = os.path.join(reports_dirname, "report3-missing-wer.json")
        with open(invalid_file, "w") as f:
            json.dump(report_without_wer, f)

        cli_summarize.process(reports_dirname)

        html_file = os.path.join(reports_dirname, "summary.html")
        assert os.path.isfile(html_file)

        with open(html_file, "r") as f:
            contents = f.read()

        # Only the two complete reports are counted
        assert "Number of reports: 2" in contents
|
|
@ -5,15 +5,15 @@ import os
|
|||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from .. import word_error_rate, words, page_text, alto_text
|
||||
from .. import alto_text, page_text, word_error_rate, words
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_word_error_rate_between_page_files():
|
||||
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
|
||||
# the ligature does not count → 2 errors
|
||||
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
|
||||
# So we have 3 changed words, the ligature does not count → 2 errors
|
||||
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
|
||||
|
||||
gt_word_count = (
|
|
@ -1,13 +1,11 @@
|
|||
import os
|
||||
import re
|
||||
|
||||
import lxml.etree as ET
|
||||
import textwrap
|
||||
|
||||
import pytest
|
||||
import lxml.etree as ET
|
||||
|
||||
from .util import working_directory
|
||||
from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
|
||||
from .util import working_directory
|
||||
|
||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
|
||||
|
@ -161,7 +159,8 @@ def test_page_level():
|
|||
result = page_text(tree, textequiv_level="line")
|
||||
assert (
|
||||
result
|
||||
== "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
|
||||
== "Hand, Mylord? fragte der Graf von Rocheſter.\n"
|
||||
+ "Als er einsmals in dem Oberhauſe eine Bill we-"
|
||||
)
|
||||
|
||||
|
|
@ -27,7 +27,8 @@ def test_words():
|
|||
def test_words_private_use_area():
|
||||
result = list(
|
||||
words(
|
||||
"ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
|
||||
"ber die vielen Sorgen wegen deelben vergaß Hartkopf, "
|
||||
"der Frau Amtmnnin das ver⸗\n"
|
||||
"ſproene zu berliefern."
|
||||
)
|
||||
)
|
|
@ -1,8 +1,8 @@
|
|||
import os
|
||||
from itertools import zip_longest
|
||||
from typing import Iterable
|
||||
|
||||
import colorama
|
||||
import os
|
||||
|
||||
|
||||
def diffprint(x, y):
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue