mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-08 03:10:30 +02:00
Merge branch 'master' into performance
This commit is contained in:
commit
38fcbc8e1c
101 changed files with 58154 additions and 199 deletions
|
@ -1,23 +0,0 @@
|
||||||
version: 2.1
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
test:
|
|
||||||
parameters:
|
|
||||||
python-version:
|
|
||||||
type: string
|
|
||||||
docker:
|
|
||||||
- image: cimg/python:<< parameters.python-version >>
|
|
||||||
steps:
|
|
||||||
- checkout
|
|
||||||
- run: pip3 install --upgrade pip
|
|
||||||
- run: pip3 install -r requirements.txt
|
|
||||||
- run: pip3 install pytest
|
|
||||||
- run: pytest
|
|
||||||
|
|
||||||
workflows:
|
|
||||||
all-tests:
|
|
||||||
jobs:
|
|
||||||
- test:
|
|
||||||
matrix:
|
|
||||||
parameters:
|
|
||||||
python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
|
|
|
@ -15,7 +15,7 @@ indent_size = 2
|
||||||
|
|
||||||
[*.json]
|
[*.json]
|
||||||
indent_size = 2
|
indent_size = 2
|
||||||
insert_final_newline = false
|
insert_final_newline = true
|
||||||
|
|
||||||
# trailing spaces in markdown indicate word wrap
|
# trailing spaces in markdown indicate word wrap
|
||||||
[*.md]
|
[*.md]
|
||||||
|
|
14
.github/workflows/release-check-version-tag
vendored
Executable file
14
.github/workflows/release-check-version-tag
vendored
Executable file
|
@ -0,0 +1,14 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# We call setuptools.setup() here as we may rely on setuptools to interpret
|
||||||
|
# a dynamic version field. (Reading pyproject.toml is not enough in that case.)
|
||||||
|
expected_git_tag="v$(python -c 'from setuptools import setup; setup()' --version)"
|
||||||
|
actual_git_tag="$(git describe --tags)"
|
||||||
|
|
||||||
|
if [[ "$expected_git_tag" == "$actual_git_tag" ]]; then
|
||||||
|
echo "OK: Python package version $expected_git_tag matches git tag"
|
||||||
|
exit 0
|
||||||
|
else
|
||||||
|
echo "ERROR: Python package version $expected_git_tag does NOT match git tag $actual_git_tag"
|
||||||
|
exit 1
|
||||||
|
fi
|
69
.github/workflows/release.yml
vendored
Normal file
69
.github/workflows/release.yml
vendored
Normal file
|
@ -0,0 +1,69 @@
|
||||||
|
name: release
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- "v*.*.*"
|
||||||
|
|
||||||
|
env:
|
||||||
|
PYPI_URL: https://pypi.org/p/dinglehopper
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test:
|
||||||
|
uses: ./.github/workflows/test.yml
|
||||||
|
|
||||||
|
build:
|
||||||
|
needs: test
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
- name: Upgrade pip
|
||||||
|
run: python3 -m pip install --upgrade pip
|
||||||
|
- name: Install setuptools
|
||||||
|
run: |
|
||||||
|
python3 -m pip install --upgrade setuptools
|
||||||
|
# For OCR-D tools, we need setuptools-ocrd to get the version
|
||||||
|
if [ -e ocrd-tool.json ]; then
|
||||||
|
python3 -m pip install setuptools-ocrd
|
||||||
|
fi
|
||||||
|
- name: Check git tag vs package version
|
||||||
|
run: .github/workflows/release-check-version-tag
|
||||||
|
- name: Build package
|
||||||
|
run: python3 -m pip install --upgrade build && python3 -m build
|
||||||
|
- name: Upload dist
|
||||||
|
uses: actions/upload-artifact@v3
|
||||||
|
with:
|
||||||
|
name: dist
|
||||||
|
path: dist/
|
||||||
|
|
||||||
|
github-release:
|
||||||
|
needs: build
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Download dist
|
||||||
|
uses: actions/download-artifact@v3
|
||||||
|
with:
|
||||||
|
name: dist
|
||||||
|
path: dist/
|
||||||
|
- name: Create release on GitHub
|
||||||
|
uses: softprops/action-gh-release@v1
|
||||||
|
with:
|
||||||
|
files: dist/*
|
||||||
|
|
||||||
|
pypi-publish:
|
||||||
|
needs: build
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
environment:
|
||||||
|
name: pypi
|
||||||
|
url: ${{ env.PYPI_URL }}
|
||||||
|
permissions:
|
||||||
|
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
|
||||||
|
steps:
|
||||||
|
- name: Download dist
|
||||||
|
uses: actions/download-artifact@v3
|
||||||
|
with:
|
||||||
|
name: dist
|
||||||
|
path: dist/
|
||||||
|
- name: Publish package distributions to PyPI
|
||||||
|
uses: pypa/gh-action-pypi-publish@release/v1
|
76
.github/workflows/test.yml
vendored
Normal file
76
.github/workflows/test.yml
vendored
Normal file
|
@ -0,0 +1,76 @@
|
||||||
|
name: test
|
||||||
|
|
||||||
|
on:
|
||||||
|
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
|
||||||
|
pull_request:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
|
||||||
|
schedule:
|
||||||
|
- cron: "00 16 07 * *" # = monthly
|
||||||
|
|
||||||
|
# Allow manually running (from GitHub Web)
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
# Allow calling this workflow (e.g. from release workflow)
|
||||||
|
workflow_call:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test:
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12" ]
|
||||||
|
|
||||||
|
# For Python 3.6, we need to fall back to Ubuntu 20.04
|
||||||
|
runs-on: ${{ matrix.python-version == '3.6' && 'ubuntu-20.04' || 'ubuntu-latest' }}
|
||||||
|
|
||||||
|
env:
|
||||||
|
test_results_dir: test-results-${{ matrix.python-version }}
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Update pip
|
||||||
|
run: python3 -m pip install -U pip
|
||||||
|
- name: Avoid compiling OpenCV and NumPy on Python 3.6
|
||||||
|
run: |
|
||||||
|
if python3 --version | grep -q "Python 3.6"; then
|
||||||
|
pip install --prefer-binary -U opencv-python-headless numpy
|
||||||
|
fi
|
||||||
|
- name: Install requirements*.txt
|
||||||
|
run: |
|
||||||
|
for requirements_txt in requirements*.txt; do
|
||||||
|
python3 -m pip install -r $requirements_txt;
|
||||||
|
done
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
run: |
|
||||||
|
cd src
|
||||||
|
mkdir -p ../$test_results_dir
|
||||||
|
python3 -m pytest --junitxml=../$test_results_dir/junit.xml -o junit_family=legacy
|
||||||
|
- name: Upload test results
|
||||||
|
uses: actions/upload-artifact@v3
|
||||||
|
if: success() || failure()
|
||||||
|
with:
|
||||||
|
name: ${{ env.test_results_dir }}
|
||||||
|
path: ${{ env.test_results_dir }}
|
||||||
|
|
||||||
|
- name: Report tests
|
||||||
|
uses: dorny/test-reporter@v1
|
||||||
|
if: success() || failure()
|
||||||
|
with:
|
||||||
|
name: Results on Python ${{ matrix.python-version }}
|
||||||
|
path: "${{env.test_results_dir }}/junit.xml"
|
||||||
|
reporter: java-junit
|
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -16,6 +16,7 @@ htmlcov/
|
||||||
.venv
|
.venv
|
||||||
env/
|
env/
|
||||||
venv/
|
venv/
|
||||||
|
.python-version
|
||||||
|
|
||||||
# mypy
|
# mypy
|
||||||
.mypy_cache/
|
.mypy_cache/
|
||||||
|
@ -27,3 +28,4 @@ dmypy.json
|
||||||
|
|
||||||
# Build artifacts
|
# Build artifacts
|
||||||
/build
|
/build
|
||||||
|
/dist
|
||||||
|
|
36
.pre-commit-config.yaml
Normal file
36
.pre-commit-config.yaml
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
repos:
|
||||||
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||||
|
rev: v4.5.0
|
||||||
|
hooks:
|
||||||
|
- id: trailing-whitespace
|
||||||
|
- id: end-of-file-fixer
|
||||||
|
- id: check-json
|
||||||
|
- id: check-toml
|
||||||
|
- id: check-yaml
|
||||||
|
- id: check-added-large-files
|
||||||
|
- id: check-ast
|
||||||
|
|
||||||
|
- repo: https://github.com/psf/black
|
||||||
|
rev: 23.10.0
|
||||||
|
hooks:
|
||||||
|
- id: black
|
||||||
|
|
||||||
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||||
|
rev: v0.1.1
|
||||||
|
hooks:
|
||||||
|
- args:
|
||||||
|
- --fix
|
||||||
|
- --exit-non-zero-on-fix
|
||||||
|
id: ruff
|
||||||
|
|
||||||
|
- repo: https://github.com/pre-commit/mirrors-mypy
|
||||||
|
rev: v1.6.1
|
||||||
|
hooks:
|
||||||
|
- additional_dependencies:
|
||||||
|
- types-setuptools
|
||||||
|
id: mypy
|
||||||
|
|
||||||
|
- repo: https://gitlab.com/vojko.pribudic/pre-commit-update
|
||||||
|
rev: v0.1.0
|
||||||
|
hooks:
|
||||||
|
- id: pre-commit-update
|
|
@ -1,6 +1,6 @@
|
||||||
Testing
|
Testing
|
||||||
=======
|
=======
|
||||||
Use `pytest` to run the tests in [the tests directory](qurator/dinglehopper/tests):
|
Use `pytest` to run the tests in [the tests directory](src/dinglehopper/tests):
|
||||||
```bash
|
```bash
|
||||||
virtualenv -p /usr/bin/python3 venv
|
virtualenv -p /usr/bin/python3 venv
|
||||||
. venv/bin/activate
|
. venv/bin/activate
|
||||||
|
@ -10,6 +10,7 @@ pytest
|
||||||
```
|
```
|
||||||
|
|
||||||
## Test running examples
|
## Test running examples
|
||||||
|
|
||||||
Only unit tests:
|
Only unit tests:
|
||||||
```bash
|
```bash
|
||||||
pytest -m "not integration"
|
pytest -m "not integration"
|
||||||
|
@ -27,11 +28,18 @@ pytest
|
||||||
|
|
||||||
All tests with code coverage:
|
All tests with code coverage:
|
||||||
```bash
|
```bash
|
||||||
pytest --cov=qurator --cov-report=html
|
pytest --cov=dinglehopper --cov-report=html
|
||||||
```
|
```
|
||||||
|
|
||||||
Static code analysis:
|
Static code analysis:
|
||||||
```bash
|
```bash
|
||||||
pytest -k "not test" --flake8
|
|
||||||
pytest -k "not test" --mypy
|
pytest -k "not test" --mypy
|
||||||
|
pytest -k "not test" --ruff
|
||||||
```
|
```
|
||||||
|
|
||||||
|
# How to use pre-commit
|
||||||
|
|
||||||
|
This project optionally uses [pre-commit](https://pre-commit.com) to check commits. To use it:
|
||||||
|
|
||||||
|
- Install pre-commit, e.g. `pip install -r requirements-dev.txt`
|
||||||
|
- Install the repo-local git hooks: `pre-commit install`
|
||||||
|
|
72
README.md
72
README.md
|
@ -5,9 +5,13 @@ dinglehopper is an OCR evaluation tool and reads
|
||||||
[ALTO](https://github.com/altoxml),
|
[ALTO](https://github.com/altoxml),
|
||||||
[PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. It
|
[PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files. It
|
||||||
compares a ground truth (GT) document page with an OCR result page to compute
|
compares a ground truth (GT) document page with an OCR result page to compute
|
||||||
metrics and a word/character differences report.
|
metrics and a word/character differences report. It also supports batch processing by
|
||||||
|
generating, aggregating and summarizing multiple reports.
|
||||||
|
|
||||||
[](https://circleci.com/gh/qurator-spk/dinglehopper)
|
[](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
|
||||||
|
[](https://github.com/qurator-spk/dinglehopper/releases/)
|
||||||
|
[](#license)
|
||||||
|
[](https://github.com/qurator-spk/dinglehopper/issues)
|
||||||
|
|
||||||
Goals
|
Goals
|
||||||
-----
|
-----
|
||||||
|
@ -19,15 +23,16 @@ Goals
|
||||||
|
|
||||||
Installation
|
Installation
|
||||||
------------
|
------------
|
||||||
It's best to use pip, e.g.:
|
|
||||||
~~~
|
It's best to use pip to install the package from PyPI, e.g.:
|
||||||
sudo pip install .
|
```
|
||||||
~~~
|
pip install dinglehopper
|
||||||
|
```
|
||||||
|
|
||||||
Usage
|
Usage
|
||||||
-----
|
-----
|
||||||
~~~
|
~~~
|
||||||
Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
|
Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] [REPORTS_FOLDER]
|
||||||
|
|
||||||
Compare the PAGE/ALTO/text document GT against the document OCR.
|
Compare the PAGE/ALTO/text document GT against the document OCR.
|
||||||
|
|
||||||
|
@ -35,19 +40,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
|
||||||
their text and falls back to plain text if no ALTO or PAGE is detected.
|
their text and falls back to plain text if no ALTO or PAGE is detected.
|
||||||
|
|
||||||
The files GT and OCR are usually a ground truth document and the result of
|
The files GT and OCR are usually a ground truth document and the result of
|
||||||
an OCR software, but you may use dinglehopper to compare two OCR results.
|
an OCR software, but you may use dinglehopper to compare two OCR results. In
|
||||||
In that case, use --no-metrics to disable the then meaningless metrics and
|
that case, use --no-metrics to disable the then meaningless metrics and also
|
||||||
also change the color scheme from green/red to blue.
|
change the color scheme from green/red to blue.
|
||||||
|
|
||||||
The comparison report will be written to $REPORT_PREFIX.{html,json}, where
|
The comparison report will be written to
|
||||||
$REPORT_PREFIX defaults to "report". The reports include the character
|
$REPORTS_FOLDER/$REPORT_PREFIX.{html,json}, where $REPORTS_FOLDER defaults
|
||||||
error rate (CER) and the word error rate (WER).
|
to the current working directory and $REPORT_PREFIX defaults to "report".
|
||||||
|
The reports include the character error rate (CER) and the word error rate
|
||||||
|
(WER).
|
||||||
|
|
||||||
By default, the text of PAGE files is extracted on 'region' level. You may
|
By default, the text of PAGE files is extracted on 'region' level. You may
|
||||||
use "--textequiv-level line" to extract from the level of TextLine tags.
|
use "--textequiv-level line" to extract from the level of TextLine tags.
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
--metrics / --no-metrics Enable/disable metrics and green/red
|
--metrics / --no-metrics Enable/disable metrics and green/red
|
||||||
|
--differences BOOLEAN Enable reporting character and word level
|
||||||
|
differences
|
||||||
--textequiv-level LEVEL PAGE TextEquiv level to extract text from
|
--textequiv-level LEVEL PAGE TextEquiv level to extract text from
|
||||||
--progress Show progress bar
|
--progress Show progress bar
|
||||||
--help Show this message and exit.
|
--help Show this message and exit.
|
||||||
|
@ -61,6 +70,43 @@ This generates `report.html` and `report.json`.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
Batch comparison between folders of GT and OCR files can be done by simply providing
|
||||||
|
folders:
|
||||||
|
~~~
|
||||||
|
dinglehopper gt/ ocr/ report output_folder/
|
||||||
|
~~~
|
||||||
|
This assumes that you have files with the same name in both folders, e.g.
|
||||||
|
`gt/00000001.page.xml` and `ocr/00000001.page.xml`.
|
||||||
|
|
||||||
|
The example generates reports for each set of files, with the prefix `report`, in the
|
||||||
|
(automatically created) folder `output_folder/`.
|
||||||
|
|
||||||
|
By default, the JSON report does not contain the character and word differences, only
|
||||||
|
the calculated metrics. If you want to include the differences, use the
|
||||||
|
`--differences` flag:
|
||||||
|
|
||||||
|
~~~
|
||||||
|
dinglehopper gt/ ocr/ report output_folder/ --differences true
|
||||||
|
~~~
|
||||||
|
|
||||||
|
### dinglehopper-summarize
|
||||||
|
A set of (JSON) reports can be summarized into a single set of
|
||||||
|
reports. This is useful after having generated reports in batch.
|
||||||
|
Example:
|
||||||
|
~~~
|
||||||
|
dinglehopper-summarize output_folder/
|
||||||
|
~~~
|
||||||
|
This generates `summary.html` and `summary.json` in the same `output_folder`.
|
||||||
|
|
||||||
|
If you are summarizing many reports and have used the `--differences` flag while
|
||||||
|
generating them, it may be useful to limit the number of differences reported by using
|
||||||
|
the `--occurences-threshold` parameter. This will reduce the size of the generated HTML
|
||||||
|
report, making it easier to open and navigate. Note that the JSON report will still
|
||||||
|
contain all differences. Example:
|
||||||
|
~~~
|
||||||
|
dinglehopper-summarize output_folder/ --occurences-threshold 10
|
||||||
|
~~~
|
||||||
|
|
||||||
### dinglehopper-line-dirs
|
### dinglehopper-line-dirs
|
||||||
You also may want to compare a directory of GT text files (e.g. `gt/line0001.gt.txt`)
|
You also may want to compare a directory of GT text files (e.g. `gt/line0001.gt.txt`)
|
||||||
with a directory of OCR text files (e.g. `ocr/line0001.some-ocr.txt`) with a separate
|
with a directory of OCR text files (e.g. `ocr/line0001.some-ocr.txt`) with a separate
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
qurator/dinglehopper/ocrd-tool.json
|
src/dinglehopper/ocrd-tool.json
|
70
pyproject.toml
Normal file
70
pyproject.toml
Normal file
|
@ -0,0 +1,70 @@
|
||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"]
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "dinglehopper"
|
||||||
|
authors = [
|
||||||
|
{name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
|
||||||
|
{name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
|
||||||
|
]
|
||||||
|
description = "The OCR evaluation tool"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.6"
|
||||||
|
keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
|
||||||
|
|
||||||
|
dynamic = ["version", "dependencies", "optional-dependencies"]
|
||||||
|
|
||||||
|
# https://pypi.org/classifiers/
|
||||||
|
classifiers = [
|
||||||
|
"Development Status :: 5 - Production/Stable",
|
||||||
|
"Environment :: Console",
|
||||||
|
"Intended Audience :: Science/Research",
|
||||||
|
"Intended Audience :: Other Audience",
|
||||||
|
"License :: OSI Approved :: Apache Software License",
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"Programming Language :: Python :: 3 :: Only",
|
||||||
|
"Topic :: Scientific/Engineering :: Information Analysis",
|
||||||
|
"Topic :: Text Processing",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
dinglehopper = "dinglehopper.cli:main"
|
||||||
|
dinglehopper-line-dirs = "dinglehopper.cli_line_dirs:main"
|
||||||
|
dinglehopper-extract = "dinglehopper.cli_extract:main"
|
||||||
|
dinglehopper-summarize = "dinglehopper.cli_summarize:main"
|
||||||
|
ocrd-dinglehopper = "dinglehopper.ocrd_cli:ocrd_dinglehopper"
|
||||||
|
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
|
Homepage = "https://github.com/qurator-spk/dinglehopper"
|
||||||
|
Repository = "https://github.com/qurator-spk/dinglehopper.git"
|
||||||
|
|
||||||
|
|
||||||
|
[tool.setuptools.dynamic]
|
||||||
|
dependencies = {file = ["requirements.txt"]}
|
||||||
|
optional-dependencies.dev = {file = ["requirements-dev.txt"]}
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
where = ["src"]
|
||||||
|
|
||||||
|
[tool.setuptools.package-data]
|
||||||
|
dinglehopper = ["templates/*"]
|
||||||
|
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
minversion = 6.0
|
||||||
|
addopts = "--strict-markers"
|
||||||
|
markers = [
|
||||||
|
"integration: integration tests",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[tool.mypy]
|
||||||
|
ignore_missing_imports = true
|
||||||
|
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
select = ["E", "F", "I"]
|
||||||
|
ignore = [
|
||||||
|
"F811", # multimethods are considered redefinitions by ruff
|
||||||
|
]
|
|
@ -1,4 +0,0 @@
|
||||||
[pytest]
|
|
||||||
markers =
|
|
||||||
integration: integration tests
|
|
||||||
serial
|
|
|
@ -1 +0,0 @@
|
||||||
__import__("pkg_resources").declare_namespace(__name__)
|
|
|
@ -1,5 +0,0 @@
|
||||||
from .ocr_files import *
|
|
||||||
from .extracted_text import *
|
|
||||||
from .character_error_rate import *
|
|
||||||
from .word_error_rate import *
|
|
||||||
from .align import *
|
|
|
@ -1,15 +0,0 @@
|
||||||
function find_diff_class(classes) {
|
|
||||||
return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
|
|
||||||
}
|
|
||||||
|
|
||||||
$(document).ready(function() {
|
|
||||||
/* Enable Bootstrap tooltips */
|
|
||||||
$('[data-toggle="tooltip"]').tooltip();
|
|
||||||
|
|
||||||
$('.diff').mouseover(function() {
|
|
||||||
find_diff_class($(this).attr('class')).addClass('diff-highlight');
|
|
||||||
});
|
|
||||||
$('.diff').mouseout(function() {
|
|
||||||
find_diff_class($(this).attr('class')).removeClass('diff-highlight');
|
|
||||||
});
|
|
||||||
});
|
|
|
@ -1,5 +1,8 @@
|
||||||
pytest
|
pytest
|
||||||
pytest-flake8
|
|
||||||
pytest-cov
|
pytest-cov
|
||||||
pytest-mypy
|
pytest-mypy
|
||||||
black
|
black
|
||||||
|
pre-commit
|
||||||
|
|
||||||
|
ruff ; python_version >= "3.7"
|
||||||
|
pytest-ruff ; python_version >= "3.7"
|
||||||
|
|
|
@ -10,4 +10,4 @@ attrs
|
||||||
multimethod >= 1.3
|
multimethod >= 1.3
|
||||||
tqdm
|
tqdm
|
||||||
rapidfuzz >= 2.7.0
|
rapidfuzz >= 2.7.0
|
||||||
six # XXX workaround OCR-D/core#730
|
chardet
|
||||||
|
|
12
setup.cfg
12
setup.cfg
|
@ -1,12 +0,0 @@
|
||||||
[flake8]
|
|
||||||
max-line-length = 88
|
|
||||||
extend-ignore = E203, W503
|
|
||||||
|
|
||||||
[pylint]
|
|
||||||
max-line-length = 88
|
|
||||||
|
|
||||||
[pylint.messages_control]
|
|
||||||
disable = C0330, C0326
|
|
||||||
|
|
||||||
[mypy]
|
|
||||||
ignore_missing_imports = True
|
|
34
setup.py
34
setup.py
|
@ -1,34 +0,0 @@
|
||||||
from io import open
|
|
||||||
from setuptools import find_packages, setup
|
|
||||||
|
|
||||||
with open("requirements.txt") as fp:
|
|
||||||
install_requires = fp.read()
|
|
||||||
|
|
||||||
with open('requirements-dev.txt') as fp:
|
|
||||||
tests_require = fp.read()
|
|
||||||
|
|
||||||
setup(
|
|
||||||
name="dinglehopper",
|
|
||||||
author="Mike Gerber, The QURATOR SPK Team",
|
|
||||||
author_email="mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de",
|
|
||||||
description="The OCR evaluation tool",
|
|
||||||
long_description=open("README.md", "r", encoding="utf-8").read(),
|
|
||||||
long_description_content_type="text/markdown",
|
|
||||||
keywords="qurator ocr",
|
|
||||||
license="Apache",
|
|
||||||
namespace_packages=["qurator"],
|
|
||||||
packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
|
|
||||||
install_requires=install_requires,
|
|
||||||
tests_require=tests_require,
|
|
||||||
package_data={
|
|
||||||
"": ["*.json", "templates/*"],
|
|
||||||
},
|
|
||||||
entry_points={
|
|
||||||
"console_scripts": [
|
|
||||||
"dinglehopper=qurator.dinglehopper.cli:main",
|
|
||||||
"dinglehopper-line-dirs=qurator.dinglehopper.cli_line_dirs:main",
|
|
||||||
"dinglehopper-extract=qurator.dinglehopper.cli_extract:main",
|
|
||||||
"ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper",
|
|
||||||
]
|
|
||||||
},
|
|
||||||
)
|
|
33
src/dinglehopper/__init__.py
Normal file
33
src/dinglehopper/__init__.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
from .align import align, score_hint, seq_align
|
||||||
|
from .character_error_rate import character_error_rate, character_error_rate_n
|
||||||
|
from .edit_distance import distance, editops
|
||||||
|
from .extracted_text import ExtractedText
|
||||||
|
from .ocr_files import (
|
||||||
|
alto_namespace,
|
||||||
|
alto_text,
|
||||||
|
page_namespace,
|
||||||
|
page_text,
|
||||||
|
plain_text,
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
from .word_error_rate import word_error_rate, word_error_rate_n, words
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"editops",
|
||||||
|
"distance",
|
||||||
|
"align",
|
||||||
|
"score_hint",
|
||||||
|
"seq_align",
|
||||||
|
"character_error_rate",
|
||||||
|
"character_error_rate_n",
|
||||||
|
"word_error_rate",
|
||||||
|
"word_error_rate_n",
|
||||||
|
"words",
|
||||||
|
"ExtractedText",
|
||||||
|
"alto_namespace",
|
||||||
|
"alto_text",
|
||||||
|
"page_namespace",
|
||||||
|
"page_text",
|
||||||
|
"plain_text",
|
||||||
|
"text",
|
||||||
|
]
|
|
@ -1,9 +1,12 @@
|
||||||
import math
|
import math
|
||||||
|
import unicodedata
|
||||||
from math import ceil
|
from math import ceil
|
||||||
|
|
||||||
from .edit_distance import *
|
|
||||||
from rapidfuzz.distance import Levenshtein
|
from rapidfuzz.distance import Levenshtein
|
||||||
|
|
||||||
|
from .edit_distance import grapheme_clusters
|
||||||
|
|
||||||
|
|
||||||
def align(t1, t2):
|
def align(t1, t2):
|
||||||
"""Align text."""
|
"""Align text."""
|
||||||
s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
|
s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
|
|
@ -1,20 +1,22 @@
|
||||||
import os
|
import os
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
import click
|
import click
|
||||||
from jinja2 import Environment, FileSystemLoader
|
from jinja2 import Environment, FileSystemLoader
|
||||||
from markupsafe import escape
|
from markupsafe import escape
|
||||||
from ocrd_utils import initLogging
|
from ocrd_utils import initLogging
|
||||||
from math import ceil
|
|
||||||
|
|
||||||
from .character_error_rate import character_error_rate_n
|
from dinglehopper.align import score_hint, seq_align
|
||||||
from .word_error_rate import word_error_rate_n, words_normalized
|
from dinglehopper.character_error_rate import character_error_rate_n
|
||||||
from .align import seq_align, score_hint
|
from dinglehopper.config import Config
|
||||||
from .extracted_text import ExtractedText
|
from dinglehopper.extracted_text import ExtractedText
|
||||||
from .ocr_files import extract
|
from dinglehopper.ocr_files import extract
|
||||||
from .config import Config
|
from dinglehopper.word_error_rate import word_error_rate_n, words_normalized
|
||||||
|
|
||||||
|
|
||||||
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
|
def gen_diff_report(
|
||||||
|
gt_in, ocr_in, css_prefix, joiner, none, *, differences=False, score_hint=None
|
||||||
|
):
|
||||||
gtx = ""
|
gtx = ""
|
||||||
ocrx = ""
|
ocrx = ""
|
||||||
|
|
||||||
|
@ -31,16 +33,12 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
|
||||||
|
|
||||||
# Set Bootstrap tooltip to the segment id
|
# Set Bootstrap tooltip to the segment id
|
||||||
if id_:
|
if id_:
|
||||||
html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
|
html_custom_attrs += f'data-toggle="tooltip" title="{id_}"'
|
||||||
|
|
||||||
if css_classes:
|
if css_classes:
|
||||||
return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
|
return f'<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'
|
||||||
css_classes=css_classes,
|
|
||||||
html_t=html_t,
|
|
||||||
html_custom_attrs=html_custom_attrs,
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
return "{html_t}".format(html_t=html_t)
|
return f"{html_t}"
|
||||||
|
|
||||||
if isinstance(gt_in, ExtractedText):
|
if isinstance(gt_in, ExtractedText):
|
||||||
if not isinstance(ocr_in, ExtractedText):
|
if not isinstance(ocr_in, ExtractedText):
|
||||||
|
@ -53,6 +51,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
|
||||||
|
|
||||||
g_pos = 0
|
g_pos = 0
|
||||||
o_pos = 0
|
o_pos = 0
|
||||||
|
found_differences = []
|
||||||
|
|
||||||
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, score_hint)):
|
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, score_hint)):
|
||||||
css_classes = None
|
css_classes = None
|
||||||
gt_id = None
|
gt_id = None
|
||||||
|
@ -65,6 +65,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
|
||||||
# Deletions and inserts only produce one id + None, UI must
|
# Deletions and inserts only produce one id + None, UI must
|
||||||
# support this, i.e. display for the one id produced
|
# support this, i.e. display for the one id produced
|
||||||
|
|
||||||
|
if differences:
|
||||||
|
found_differences.append(f"{g} :: {o}")
|
||||||
|
|
||||||
gtx += joiner + format_thing(g, css_classes, gt_id)
|
gtx += joiner + format_thing(g, css_classes, gt_id)
|
||||||
ocrx += joiner + format_thing(o, css_classes, ocr_id)
|
ocrx += joiner + format_thing(o, css_classes, ocr_id)
|
||||||
|
|
||||||
|
@ -73,13 +76,18 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None):
|
||||||
if o is not None:
|
if o is not None:
|
||||||
o_pos += len(o)
|
o_pos += len(o)
|
||||||
|
|
||||||
return """
|
found_differences = dict(Counter(elem for elem in found_differences))
|
||||||
|
|
||||||
|
return (
|
||||||
|
"""
|
||||||
<div class="row">
|
<div class="row">
|
||||||
<div class="col-md-6 gt">{}</div>
|
<div class="col-md-6 gt">{}</div>
|
||||||
<div class="col-md-6 ocr">{}</div>
|
<div class="col-md-6 ocr">{}</div>
|
||||||
</div>
|
</div>
|
||||||
""".format(
|
""".format(
|
||||||
gtx, ocrx
|
gtx, ocrx
|
||||||
|
),
|
||||||
|
found_differences,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -96,11 +104,20 @@ def json_float(value):
|
||||||
return str(value)
|
return str(value)
|
||||||
|
|
||||||
|
|
||||||
def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
def process(
|
||||||
|
gt,
|
||||||
|
ocr,
|
||||||
|
report_prefix,
|
||||||
|
reports_folder=".",
|
||||||
|
*,
|
||||||
|
metrics=True,
|
||||||
|
differences=False,
|
||||||
|
textequiv_level="region",
|
||||||
|
):
|
||||||
"""Check OCR result against GT.
|
"""Check OCR result against GT.
|
||||||
|
|
||||||
The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
|
The @click decorators change the signature of the decorated functions, so we keep
|
||||||
Click on a wrapper.
|
this undecorated version and use Click on a wrapper.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
gt_text = extract(gt, textequiv_level=textequiv_level)
|
gt_text = extract(gt, textequiv_level=textequiv_level)
|
||||||
|
@ -109,15 +126,25 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
||||||
ocr_words = words_normalized(ocr_text)
|
ocr_words = words_normalized(ocr_text)
|
||||||
|
|
||||||
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
|
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
|
||||||
char_diff_report = gen_diff_report(
|
char_diff_report, diff_c = gen_diff_report(
|
||||||
gt_text, ocr_text, css_prefix="c", joiner="", none="·",
|
gt_text,
|
||||||
score_hint=score_hint(cer, n_characters)
|
ocr_text,
|
||||||
|
css_prefix="c",
|
||||||
|
joiner="",
|
||||||
|
none="·",
|
||||||
|
score_hint=score_hint(cer, n_characters),
|
||||||
|
differences=differences,
|
||||||
)
|
)
|
||||||
|
|
||||||
wer, n_words = word_error_rate_n(gt_words, ocr_words)
|
wer, n_words = word_error_rate_n(gt_words, ocr_words)
|
||||||
word_diff_report = gen_diff_report(
|
word_diff_report, diff_w = gen_diff_report(
|
||||||
gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯",
|
gt_words,
|
||||||
score_hint=score_hint(wer, n_words)
|
ocr_words,
|
||||||
|
css_prefix="w",
|
||||||
|
joiner=" ",
|
||||||
|
none="⋯",
|
||||||
|
score_hint=score_hint(wer, n_words),
|
||||||
|
differences=differences,
|
||||||
)
|
)
|
||||||
|
|
||||||
env = Environment(
|
env = Environment(
|
||||||
|
@ -129,7 +156,11 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
||||||
|
|
||||||
for report_suffix in (".html", ".json"):
|
for report_suffix in (".html", ".json"):
|
||||||
template_fn = "report" + report_suffix + ".j2"
|
template_fn = "report" + report_suffix + ".j2"
|
||||||
out_fn = report_prefix + report_suffix
|
|
||||||
|
if not os.path.isdir(reports_folder):
|
||||||
|
os.mkdir(reports_folder)
|
||||||
|
|
||||||
|
out_fn = os.path.join(reports_folder, report_prefix + report_suffix)
|
||||||
|
|
||||||
template = env.get_template(template_fn)
|
template = env.get_template(template_fn)
|
||||||
template.stream(
|
template.stream(
|
||||||
|
@ -142,16 +173,46 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
||||||
char_diff_report=char_diff_report,
|
char_diff_report=char_diff_report,
|
||||||
word_diff_report=word_diff_report,
|
word_diff_report=word_diff_report,
|
||||||
metrics=metrics,
|
metrics=metrics,
|
||||||
|
differences=differences,
|
||||||
|
diff_c=diff_c,
|
||||||
|
diff_w=diff_w,
|
||||||
).dump(out_fn)
|
).dump(out_fn)
|
||||||
|
|
||||||
|
|
||||||
|
def process_dir(
    gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level
):
    """Compare every file in the directory ``gt`` with its namesake in ``ocr``.

    For each entry of ``gt`` that exists as a regular file in both directories,
    ``process()`` is called with the report prefix ``"<file name>-<report_prefix>"``.
    Entries that are not regular files on both sides are skipped with a message.

    :param gt: Directory containing the ground truth files.
    :param ocr: Directory containing the OCR files, matched by file name.
    :param report_prefix: Suffix appended to each file name to form its report prefix.
    :param reports_folder: Forwarded to ``process()`` as ``reports_folder``.
    :param metrics: Forwarded to ``process()`` as ``metrics``.
    :param differences: Forwarded to ``process()`` as ``differences``.
    :param textequiv_level: Forwarded to ``process()`` as ``textequiv_level``.
    """
    # os.listdir() returns entries in arbitrary order; sort them so that reports
    # and "Skipping" messages appear in a reproducible order.
    for gt_file in sorted(os.listdir(gt)):
        gt_file_path = os.path.join(gt, gt_file)
        ocr_file_path = os.path.join(ocr, gt_file)

        if os.path.isfile(gt_file_path) and os.path.isfile(ocr_file_path):
            process(
                gt_file_path,
                ocr_file_path,
                f"{gt_file}-{report_prefix}",
                reports_folder=reports_folder,
                metrics=metrics,
                differences=differences,
                textequiv_level=textequiv_level,
            )
        else:
            print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
|
||||||
|
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.argument("gt", type=click.Path(exists=True))
|
@click.argument("gt", type=click.Path(exists=True))
|
||||||
@click.argument("ocr", type=click.Path(exists=True))
|
@click.argument("ocr", type=click.Path(exists=True))
|
||||||
@click.argument("report_prefix", type=click.Path(), default="report")
|
@click.argument("report_prefix", type=click.Path(), default="report")
|
||||||
|
@click.argument("reports_folder", type=click.Path(), default=".")
|
||||||
@click.option(
|
@click.option(
|
||||||
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
|
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
|
||||||
)
|
)
|
||||||
|
@click.option(
|
||||||
|
"--differences",
|
||||||
|
default=False,
|
||||||
|
help="Enable reporting character and word level differences",
|
||||||
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"--textequiv-level",
|
"--textequiv-level",
|
||||||
default="region",
|
default="region",
|
||||||
|
@ -159,7 +220,16 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
|
||||||
metavar="LEVEL",
|
metavar="LEVEL",
|
||||||
)
|
)
|
||||||
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
|
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
|
||||||
def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
|
def main(
|
||||||
|
gt,
|
||||||
|
ocr,
|
||||||
|
report_prefix,
|
||||||
|
reports_folder,
|
||||||
|
metrics,
|
||||||
|
differences,
|
||||||
|
textequiv_level,
|
||||||
|
progress,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Compare the PAGE/ALTO/text document GT against the document OCR.
|
Compare the PAGE/ALTO/text document GT against the document OCR.
|
||||||
|
|
||||||
|
@ -171,7 +241,8 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
|
||||||
that case, use --no-metrics to disable the then meaningless metrics and also
|
that case, use --no-metrics to disable the then meaningless metrics and also
|
||||||
change the color scheme from green/red to blue.
|
change the color scheme from green/red to blue.
|
||||||
|
|
||||||
The comparison report will be written to $REPORT_PREFIX.{html,json}, where
|
The comparison report will be written to $REPORTS_FOLDER/$REPORT_PREFIX.{html,json},
|
||||||
|
where $REPORTS_FOLDER defaults to the current working directory and
|
||||||
$REPORT_PREFIX defaults to "report". The reports include the character error
|
$REPORT_PREFIX defaults to "report". The reports include the character error
|
||||||
rate (CER) and the word error rate (WER).
|
rate (CER) and the word error rate (WER).
|
||||||
|
|
||||||
|
@ -180,7 +251,31 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
|
||||||
"""
|
"""
|
||||||
initLogging()
|
initLogging()
|
||||||
Config.progress = progress
|
Config.progress = progress
|
||||||
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
|
if os.path.isdir(gt):
|
||||||
|
if not os.path.isdir(ocr):
|
||||||
|
raise click.BadParameter(
|
||||||
|
"OCR must be a directory if GT is a directory", param_hint="ocr"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
process_dir(
|
||||||
|
gt,
|
||||||
|
ocr,
|
||||||
|
report_prefix,
|
||||||
|
reports_folder,
|
||||||
|
metrics,
|
||||||
|
differences,
|
||||||
|
textequiv_level,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
process(
|
||||||
|
gt,
|
||||||
|
ocr,
|
||||||
|
report_prefix,
|
||||||
|
reports_folder,
|
||||||
|
metrics=metrics,
|
||||||
|
differences=differences,
|
||||||
|
textequiv_level=textequiv_level,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
|
@ -1,15 +1,15 @@
|
||||||
import os
|
|
||||||
import itertools
|
import itertools
|
||||||
|
import os
|
||||||
|
|
||||||
import click
|
import click
|
||||||
from jinja2 import Environment, FileSystemLoader
|
from jinja2 import Environment, FileSystemLoader
|
||||||
from ocrd_utils import initLogging
|
from ocrd_utils import initLogging
|
||||||
from math import ceil
|
|
||||||
|
|
||||||
|
from .align import score_hint
|
||||||
from .character_error_rate import character_error_rate_n
|
from .character_error_rate import character_error_rate_n
|
||||||
from .word_error_rate import word_error_rate_n, words_normalized
|
|
||||||
from .ocr_files import plain_extract
|
|
||||||
from .cli import gen_diff_report, json_float
|
from .cli import gen_diff_report, json_float
|
||||||
|
from .ocr_files import plain_extract
|
||||||
|
from .word_error_rate import word_error_rate_n, words_normalized
|
||||||
|
|
||||||
|
|
||||||
def all_equal(iterable):
|
def all_equal(iterable):
|
||||||
|
@ -75,12 +75,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
|
||||||
|
|
||||||
# Generate diff reports
|
# Generate diff reports
|
||||||
char_diff_report += gen_diff_report(
|
char_diff_report += gen_diff_report(
|
||||||
gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·",
|
gt_text,
|
||||||
score_hint=score_hint(l_cer, l_n_characters)
|
ocr_text,
|
||||||
|
css_prefix="l{0}-c".format(k),
|
||||||
|
joiner="",
|
||||||
|
none="·",
|
||||||
|
score_hint=score_hint(l_cer, l_n_characters),
|
||||||
)
|
)
|
||||||
word_diff_report += gen_diff_report(
|
word_diff_report += gen_diff_report(
|
||||||
gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯",
|
gt_words,
|
||||||
score_hint=score_hint(l_wer, l_n_words)
|
ocr_words,
|
||||||
|
css_prefix="l{0}-w".format(k),
|
||||||
|
joiner=" ",
|
||||||
|
none="⋯",
|
||||||
|
score_hint=score_hint(l_wer, l_n_words),
|
||||||
)
|
)
|
||||||
|
|
||||||
env = Environment(
|
env = Environment(
|
106
src/dinglehopper/cli_summarize.py
Normal file
106
src/dinglehopper/cli_summarize.py
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
import click
|
||||||
|
from jinja2 import Environment, FileSystemLoader
|
||||||
|
from ocrd_utils import initLogging
|
||||||
|
|
||||||
|
from dinglehopper.cli import json_float
|
||||||
|
|
||||||
|
|
||||||
|
def process(reports_folder, occurrences_threshold=1):
    """Summarize all dinglehopper JSON reports found in ``reports_folder``.

    Every ``*.json`` file in the folder that contains both a ``"cer"`` and a
    ``"wer"`` key counts as a report; other JSON files are skipped with a message.
    The average CER/WER and the accumulated character and word level differences
    are printed and rendered to ``summary.html`` and ``summary.json`` inside
    ``reports_folder``. Nothing is written if no report was found.

    :param reports_folder: Folder that is scanned for reports and receives the
        summary files.
    :param occurrences_threshold: Passed to the summary templates as
        ``occurrences_threshold``.
    """
    cer_list = []
    wer_list = []
    diff_c = {}
    diff_w = {}

    # Sorted, so that the order of the accumulated differences (and therefore of
    # the generated tables) does not depend on the file system.
    for report in sorted(os.listdir(reports_folder)):
        if not report.endswith(".json"):
            continue

        with open(os.path.join(reports_folder, report), "r") as f:
            report_data = json.load(f)

        if "cer" not in report_data or "wer" not in report_data:
            click.echo(f"Skipping {report} because it does not contain CER and WER")
            continue

        cer_list.append(report_data["cer"])
        wer_list.append(report_data["wer"])

        # The "differences" section is optional. Each level is looked up on its
        # own, so a missing level does not discard the other one.
        found = report_data.get("differences", {})
        for level, totals in (("character_level", diff_c), ("word_level", diff_w)):
            for key, value in found.get(level, {}).items():
                totals[key] = totals.get(key, 0) + value

    if not cer_list:
        click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
        return

    cer_avg = sum(cer_list) / len(cer_list)
    wer_avg = sum(wer_list) / len(wer_list)

    print(f"Number of reports: {len(cer_list)}")
    print(f"Average CER: {cer_avg}")
    print(f"Average WER: {wer_avg}")
    # Report the number of accumulated mistakes per level (previously the summed
    # CER/WER values were printed twice under the same label).
    print(f"Sum of common mistakes (character level): {sum(diff_c.values())}")
    print(f"Sum of common mistakes (word level): {sum(diff_w.values())}")

    env = Environment(
        loader=FileSystemLoader(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
        )
    )
    env.filters["json_float"] = json_float
    for report_suffix in (".html", ".json"):
        template_fn = "summary" + report_suffix + ".j2"

        out_fn = os.path.join(reports_folder, "summary" + report_suffix)
        template = env.get_template(template_fn)
        template.stream(
            num_reports=len(cer_list),
            cer_avg=cer_avg,
            wer_avg=wer_avg,
            diff_c=diff_c,
            diff_w=diff_w,
            occurrences_threshold=occurrences_threshold,
        ).dump(out_fn)
|
||||||
|
|
||||||
|
|
||||||
|
@click.command()
@click.argument("reports_folder", type=click.Path(exists=True), default="./reports")
@click.option(
    "--occurrences-threshold",
    type=int,
    default=1,
    help="Only show differences that occur at least this many times.",
)
def main(reports_folder, occurrences_threshold):
    """
    Summarize the results from multiple reports generated earlier by dinglehopper.
    It calculates the average CER and WER, as well as a sum of common mistakes.
    Reports include lists of mistakes and their occurrences.

    You may use a threshold to reduce the file size of the HTML report by only showing
    mistakes that occur at least as often as the threshold. The JSON report will
    always contain all mistakes.

    All JSON files in the provided folder will be gathered and summarized.
    """
    initLogging()
    process(reports_folder, occurrences_threshold)


if __name__ == "__main__":
    main()
|
|
@ -1,8 +1,8 @@
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
from multimethod import multimethod
|
from multimethod import multimethod
|
||||||
from uniseg.graphemecluster import grapheme_clusters
|
|
||||||
from rapidfuzz.distance import Levenshtein
|
from rapidfuzz.distance import Levenshtein
|
||||||
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
from .extracted_text import ExtractedText
|
from .extracted_text import ExtractedText
|
||||||
|
|
|
@ -2,6 +2,7 @@ import os
|
||||||
import sys
|
import sys
|
||||||
from typing import Iterator
|
from typing import Iterator
|
||||||
|
|
||||||
|
import chardet
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
from lxml.etree import XMLSyntaxError
|
from lxml.etree import XMLSyntaxError
|
||||||
from uniseg.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
@ -12,8 +13,8 @@ from .extracted_text import ExtractedText, normalize_sbb
|
||||||
def alto_namespace(tree: ET.ElementTree) -> str:
|
def alto_namespace(tree: ET.ElementTree) -> str:
|
||||||
"""Return the ALTO namespace used in the given ElementTree.
|
"""Return the ALTO namespace used in the given ElementTree.
|
||||||
|
|
||||||
This relies on the assumption that, in any given ALTO file, the root element has the local name "alto". We do not
|
This relies on the assumption that, in any given ALTO file, the root element has the
|
||||||
check if the files uses any valid ALTO namespace.
|
local name "alto". We do not check if the files uses any valid ALTO namespace.
|
||||||
"""
|
"""
|
||||||
root_name = ET.QName(tree.getroot().tag)
|
root_name = ET.QName(tree.getroot().tag)
|
||||||
if root_name.localname == "alto":
|
if root_name.localname == "alto":
|
||||||
|
@ -48,8 +49,9 @@ def alto_text(tree):
|
||||||
def page_namespace(tree):
|
def page_namespace(tree):
|
||||||
"""Return the PAGE content namespace used in the given ElementTree.
|
"""Return the PAGE content namespace used in the given ElementTree.
|
||||||
|
|
||||||
This relies on the assumption that, in any given PAGE content file, the root element has the local name "PcGts". We
|
This relies on the assumption that, in any given PAGE content file, the root element
|
||||||
do not check if the files uses any valid PAGE namespace.
|
has the local name "PcGts". We do not check if the files uses any valid PAGE
|
||||||
|
namespace.
|
||||||
"""
|
"""
|
||||||
root_name = ET.QName(tree.getroot().tag)
|
root_name = ET.QName(tree.getroot().tag)
|
||||||
if root_name.localname == "PcGts":
|
if root_name.localname == "PcGts":
|
||||||
|
@ -135,6 +137,10 @@ def page_text(tree, *, textequiv_level="region"):
|
||||||
return page_extract(tree, textequiv_level=textequiv_level).text
|
return page_extract(tree, textequiv_level=textequiv_level).text
|
||||||
|
|
||||||
|
|
||||||
|
def detect_encoding(filename):
    """Guess the text encoding of ``filename`` from its first 1024 bytes.

    Returns the ``"encoding"`` entry of chardet's detection result.
    """
    # Read inside a context manager so the file handle is closed immediately
    # instead of being left to the garbage collector.
    with open(filename, "rb") as f:
        return chardet.detect(f.read(1024))["encoding"]
|
||||||
|
|
||||||
|
|
||||||
def plain_extract(filename, include_filename_in_id=False):
|
def plain_extract(filename, include_filename_in_id=False):
|
||||||
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
|
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
|
||||||
|
|
||||||
|
@ -149,7 +155,8 @@ def plain_extract(filename, include_filename_in_id=False):
|
||||||
clusters,
|
clusters,
|
||||||
)
|
)
|
||||||
|
|
||||||
with open(filename, "r") as f:
|
fileencoding = detect_encoding(filename)
|
||||||
|
with open(filename, "r", encoding=fileencoding) as f:
|
||||||
return ExtractedText(
|
return ExtractedText(
|
||||||
None,
|
None,
|
||||||
[make_segment(no, line) for no, line in enumerate(f.readlines())],
|
[make_segment(no, line) for no, line in enumerate(f.readlines())],
|
||||||
|
@ -171,7 +178,7 @@ def extract(filename, *, textequiv_level="region"):
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
tree = ET.parse(filename)
|
tree = ET.parse(filename)
|
||||||
except XMLSyntaxError:
|
except (XMLSyntaxError, UnicodeDecodeError):
|
||||||
return plain_extract(filename)
|
return plain_extract(filename)
|
||||||
try:
|
try:
|
||||||
return page_extract(tree, textequiv_level=textequiv_level)
|
return page_extract(tree, textequiv_level=textequiv_level)
|
|
@ -1,4 +1,5 @@
|
||||||
{
|
{
|
||||||
|
"version": "0.9.4",
|
||||||
"git_url": "https://github.com/qurator-spk/dinglehopper",
|
"git_url": "https://github.com/qurator-spk/dinglehopper",
|
||||||
"tools": {
|
"tools": {
|
||||||
"ocrd-dinglehopper": {
|
"ocrd-dinglehopper": {
|
|
@ -4,7 +4,7 @@ import os
|
||||||
import click
|
import click
|
||||||
from ocrd import Processor
|
from ocrd import Processor
|
||||||
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
||||||
from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality
|
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
|
||||||
from pkg_resources import resource_string
|
from pkg_resources import resource_string
|
||||||
|
|
||||||
from .cli import process as cli_process
|
from .cli import process as cli_process
|
|
@ -26,6 +26,22 @@
|
||||||
border: 2px solid;
|
border: 2px solid;
|
||||||
border-radius: 5px;
|
border-radius: 5px;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.row {
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
table {
|
||||||
|
width: 100%;
|
||||||
|
}
|
||||||
|
|
||||||
|
th {
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
|
||||||
|
th:hover {
|
||||||
|
background-color: #eee;
|
||||||
|
}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
|
@ -50,6 +66,32 @@
|
||||||
<h2>Word differences</h2>
|
<h2>Word differences</h2>
|
||||||
{{ word_diff_report }}
|
{{ word_diff_report }}
|
||||||
|
|
||||||
|
{%- if differences %}
|
||||||
|
{% set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
|
||||||
|
|
||||||
|
<div class="row">
|
||||||
|
{% for section in sections %}
|
||||||
|
<div class="col-md-6">
|
||||||
|
<h2>{{ section['title'] }}</h2>
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>GT</th>
|
||||||
|
<th>OCR</th>
|
||||||
|
<th>Occurrences</th>
|
||||||
|
</tr>
</thead>
|
||||||
|
{% for gt_ocr, occurrences in section['data'].items() %}
|
||||||
|
<tr>
|
||||||
|
<td>{{ gt_ocr.split("::")[0] }}</td>
|
||||||
|
<td>{{ gt_ocr.split("::")[1] }}</td>
|
||||||
|
<td>{{ occurrences }}</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
|
39
src/dinglehopper/templates/report.html.js
Normal file
39
src/dinglehopper/templates/report.html.js
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
/* Select all elements that carry the same numbered diff class as the given
   class attribute value (whitespace separated list of class names). */
function find_diff_class(classes) {
    const diffClass = classes
        .split(/\s+/)
        .find(name => name.match(/.diff\d.*/));
    return $('.' + diffClass);
}
|
||||||
|
|
||||||
|
$(document).ready(function() {
    /* Enable Bootstrap tooltips */
    $('[data-toggle="tooltip"]').tooltip();

    /* Highlight every element of a difference while hovering over one of them */
    $('.diff')
        .mouseover(function() {
            find_diff_class($(this).attr('class')).addClass('diff-highlight');
        })
        .mouseout(function() {
            find_diff_class($(this).attr('class')).removeClass('diff-highlight');
        });

    /* Sort this column of the table; each click on a header toggles the direction */
    $('th').click(function () {
        var $table = $(this).closest('table');
        var sortedRows = $table
            .find('tbody > tr')
            .toArray()
            .sort(compareRows($(this).index()));

        this.asc = !this.asc;
        if (!this.asc) {
            sortedRows.reverse();
        }

        /* Re-appending a row moves it, so this puts the rows into sorted order */
        sortedRows.forEach(function (row) {
            $table.children('tbody').append(row);
        });
    });

    /* Build a comparator for the text of the cells in the given column */
    function compareRows(index) {
        function cellText(row) {
            return $(row).children('td').eq(index).text().toLowerCase();
        }

        return function (row1, row2) {
            var cell1 = cellText(row1);
            var cell2 = cellText(row2);
            return cell1.localeCompare(cell2, undefined, {
                numeric: true,
                sensitivity: 'base'
            });
        }
    }
});
|
|
@ -4,6 +4,12 @@
|
||||||
{% if metrics %}
|
{% if metrics %}
|
||||||
"cer": {{ cer|json_float }},
|
"cer": {{ cer|json_float }},
|
||||||
"wer": {{ wer|json_float }},
|
"wer": {{ wer|json_float }},
|
||||||
|
{% endif %}
|
||||||
|
{% if differences %}
|
||||||
|
"differences": {
|
||||||
|
"character_level": {{ diff_c|tojson }},
|
||||||
|
"word_level": {{ diff_w|tojson }}
|
||||||
|
},
|
||||||
{% endif %}
|
{% endif %}
|
||||||
"n_characters": {{ n_characters }},
|
"n_characters": {{ n_characters }},
|
||||||
"n_words": {{ n_words }}
|
"n_words": {{ n_words }}
|
136
src/dinglehopper/templates/summary.html.j2
Normal file
136
src/dinglehopper/templates/summary.html.j2
Normal file
|
@ -0,0 +1,136 @@
|
||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||||
|
|
||||||
|
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
|
||||||
|
<style type="text/css">
|
||||||
|
{% if metrics %}
|
||||||
|
.gt .diff {
|
||||||
|
color: green;
|
||||||
|
}
|
||||||
|
.ocr .diff {
|
||||||
|
color: red;
|
||||||
|
}
|
||||||
|
{% else %}
|
||||||
|
.gt .diff, .ocr .diff {
|
||||||
|
color: blue;
|
||||||
|
}
|
||||||
|
{% endif %}
|
||||||
|
.ellipsis {
|
||||||
|
opacity: 0.5;
|
||||||
|
font-style: italic;
|
||||||
|
}
|
||||||
|
.diff-highlight {
|
||||||
|
border: 2px solid;
|
||||||
|
border-radius: 5px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.row {
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
table {
|
||||||
|
width: 100%;
|
||||||
|
}
|
||||||
|
|
||||||
|
.cer {
|
||||||
|
flex-direction: column;
|
||||||
|
}
|
||||||
|
|
||||||
|
tr:hover {
|
||||||
|
background-color: #f5f5f5;
|
||||||
|
}
|
||||||
|
|
||||||
|
th {
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
|
||||||
|
th:hover {
|
||||||
|
background-color: #eee;
|
||||||
|
}
|
||||||
|
|
||||||
|
td {
|
||||||
|
min-width: 100px;
|
||||||
|
}
|
||||||
|
|
||||||
|
td:hover {
|
||||||
|
background-color: #eee;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
|
||||||
|
<div class="row">
|
||||||
|
<h1>Summary of all reports</h1>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row">
|
||||||
|
<p>Number of reports: {{ num_reports }}</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% if cer_avg and wer_avg -%}
|
||||||
|
<div class="row">
|
||||||
|
<h2>Metrics</h2>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row cer">
|
||||||
|
<p>Average CER: {{ cer_avg|round(4) }}</p>
|
||||||
|
<p>Average WER: {{ wer_avg|round(4) }}</p>
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{%- if diff_c and diff_w %}
|
||||||
|
{%- set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}
|
||||||
|
|
||||||
|
<div class="row">
|
||||||
|
{%- for section in sections %}
|
||||||
|
<div class="col-md-6">
|
||||||
|
<h2>{{ section['title'] }}</h2>
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr><th>GT</th><th>OCR</th><th>Occurrences</th></tr>
|
||||||
|
</thead>
|
||||||
|
{%- set num_omitted = namespace(value=0) -%}
|
||||||
|
{% for gt_ocr, occurrences in section['data'].items() -%}
|
||||||
|
{% if occurrences < occurrences_threshold -%}
|
||||||
|
{%- set num_omitted.value = num_omitted.value + 1 %}
|
||||||
|
{%- else -%}
|
||||||
|
{%- set gt = gt_ocr.split(" :: ")[0] %}
|
||||||
|
{%- set ocr = gt_ocr.split(" :: ")[1] %}
|
||||||
|
<tr>
|
||||||
|
<td title="{{ gt|urlencode }}">{{ gt }}</td>{# display the unicode character #}
|
||||||
|
<td title="{{ ocr|urlencode }}">{{ ocr }}</td >
|
||||||
|
<td>{{ occurrences }}</td>
|
||||||
|
</tr>
|
||||||
|
{%- endif %}
|
||||||
|
{%- endfor %}
|
||||||
|
|
||||||
|
{% if num_omitted.value > 0 and occurrences_threshold > 1 -%}
|
||||||
|
<p>Skipped {{ num_omitted.value }} diffs with fewer than {{ occurrences_threshold }} occurrences. The complete list of diffs is available in the accompanying JSON file.</p>
|
||||||
|
{%- set num_omitted.value = 0 %}
|
||||||
|
{%- endif %}
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
{%- endfor %}
|
||||||
|
</div>
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
|
||||||
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
|
||||||
|
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
{% include 'report.html.js' %}
|
||||||
|
</script>
|
||||||
|
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
15
src/dinglehopper/templates/summary.json.j2
Normal file
15
src/dinglehopper/templates/summary.json.j2
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
{
    "num_reports": {{ num_reports}}
    {%- if cer_avg and wer_avg %}
    ,
    "cer_avg": {{ cer_avg|json_float }},
    "wer_avg": {{ wer_avg|json_float }}
    {%- endif %}
    {#- Emit the differences only if both levels are non-empty, the same condition summary.html.j2 uses. #}
    {%- if diff_c and diff_w %}
    ,
    "differences": {
        "character_level": {{ diff_c|tojson }},
        "word_level": {{ diff_w|tojson }}
    }
    {%- endif %}
}
|
File diff suppressed because it is too large
Load diff
22865
src/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
Normal file
22865
src/dinglehopper/tests/data/bigger-texts/00008228/00008228.gt.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/directory-test/ocr/2.xml
Normal file
3394
src/dinglehopper/tests/data/directory-test/ocr/2.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
Normal file
3394
src/dinglehopper/tests/data/directory-test/ocr/3-has-no-gt.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/test-fake-ocr.page2018.xml
Normal file
3394
src/dinglehopper/tests/data/test-fake-ocr.page2018.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/test-gt.page2018.xml
Normal file
3394
src/dinglehopper/tests/data/test-gt.page2018.xml
Normal file
File diff suppressed because it is too large
Load diff
3394
src/dinglehopper/tests/data/test.page2018.xml
Normal file
3394
src/dinglehopper/tests/data/test.page2018.xml
Normal file
File diff suppressed because it is too large
Load diff
Before Width: | Height: | Size: 426 KiB After Width: | Height: | Size: 426 KiB |
|
@ -6,7 +6,7 @@ import pytest
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
from uniseg.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
from .. import seq_align, ExtractedText
|
from .. import ExtractedText, seq_align
|
||||||
|
|
||||||
|
|
||||||
def test_text():
|
def test_text():
|
||||||
|
@ -30,12 +30,20 @@ def test_text():
|
||||||
|
|
||||||
def test_normalization_check():
|
def test_normalization_check():
|
||||||
with pytest.raises(ValueError, match=r".*is not in NFC.*"):
|
with pytest.raises(ValueError, match=r".*is not in NFC.*"):
|
||||||
ExtractedText("foo", None, None,
|
ExtractedText(
|
||||||
|
"foo",
|
||||||
|
None,
|
||||||
|
None,
|
||||||
unicodedata.normalize("NFD", "Schlyñ"),
|
unicodedata.normalize("NFD", "Schlyñ"),
|
||||||
grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")))
|
grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")),
|
||||||
assert ExtractedText("foo", None, None,
|
)
|
||||||
|
assert ExtractedText(
|
||||||
|
"foo",
|
||||||
|
None,
|
||||||
|
None,
|
||||||
unicodedata.normalize("NFC", "Schlyñ"),
|
unicodedata.normalize("NFC", "Schlyñ"),
|
||||||
grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")))
|
grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
|
AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
|
|
@ -1,7 +1,9 @@
|
||||||
import math
|
import math
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from .. import align, distance, score_hint, seq_align
|
||||||
from .util import unzip
|
from .util import unzip
|
||||||
from .. import align, seq_align, distance, score_hint
|
|
||||||
|
|
||||||
|
|
||||||
def test_left_empty():
|
def test_left_empty():
|
||||||
|
@ -72,7 +74,8 @@ def test_with_some_fake_ocr_errors():
|
||||||
result = list(
|
result = list(
|
||||||
align(
|
align(
|
||||||
"Über die vielen Sorgen wegen desselben vergaß",
|
"Über die vielen Sorgen wegen desselben vergaß",
|
||||||
"SomeJunk MoreJunk Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
|
"SomeJunk MoreJunk "
|
||||||
|
+ "Übey die vielen Sorgen wegen AdditionalJunk deffelben vcrgab",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
left, right = unzip(result)
|
left, right = unzip(result)
|
||||||
|
@ -183,6 +186,7 @@ def test_lines_similar():
|
||||||
# Test __eq__ (i.e. is it a substitution or a similar string?)
|
# Test __eq__ (i.e. is it a substitution or a similar string?)
|
||||||
assert list(left)[0] == list(right)[0]
|
assert list(left)[0] == list(right)[0]
|
||||||
|
|
||||||
|
|
||||||
def test_score_hint():
|
def test_score_hint():
|
||||||
assert score_hint(0.5, 23) == 12 # int(ceil())
|
assert score_hint(0.5, 23) == 12 # int(ceil())
|
||||||
assert score_hint(math.inf, 12345) is None
|
assert score_hint(math.inf, 12345) is None
|
|
@ -36,6 +36,7 @@ def test_character_error_rate_hard():
|
||||||
len(s2) == 7
|
len(s2) == 7
|
||||||
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
|
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
|
||||||
|
|
||||||
# Both strings have the same length in terms of grapheme clusters. So the CER should be symmetrical.
|
# Both strings have the same length in terms of grapheme clusters. So the CER should
|
||||||
|
# be symmetrical.
|
||||||
assert character_error_rate(s2, s1) == 1 / 6
|
assert character_error_rate(s2, s1) == 1 / 6
|
||||||
assert character_error_rate(s1, s2) == 1 / 6
|
assert character_error_rate(s1, s2) == 1 / 6
|
|
@ -15,7 +15,9 @@ def test_align_page_files():
|
||||||
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
|
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
|
||||||
# → 2 elements in the alignment should be different, the ligature is
|
# → 2 elements in the alignment should be different, the ligature is
|
||||||
# (currently) not counted due to normalization.
|
# (currently) not counted due to normalization.
|
||||||
# NOTE: In this example, it doesn't matter that we work with "characters", not grapheme clusters.
|
#
|
||||||
|
# NOTE: In this example, it doesn't matter that we work with "characters", not
|
||||||
|
# grapheme clusters.
|
||||||
|
|
||||||
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
|
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
|
||||||
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
|
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
|
28
src/dinglehopper/tests/test_integ_bigger_texts.py
Normal file
28
src/dinglehopper/tests/test_integ_bigger_texts.py
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
from __future__ import division, print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from lxml import etree as ET
|
||||||
|
|
||||||
|
from .. import alto_text, character_error_rate, page_text
|
||||||
|
|
||||||
|
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_bigger_texts():
    """Check that the CER of a large GT/OCR document pair can be computed."""
    text_dir = os.path.join(data_dir, "bigger-texts", "00008228")

    gt_path = os.path.join(text_dir, "00008228.gt.xml")
    gt = page_text(ET.parse(gt_path))

    ocr_path = os.path.join(text_dir, "00008228-00236534.gt4hist.xml")
    ocr = alto_text(ET.parse(ocr_path))

    # Only interested in a result here: In earlier versions this would have used
    # tens of GB of RAM and should now not break a sweat.
    cer = character_error_rate(gt, ocr)
    assert cer >= 0.0
|
|
@ -6,7 +6,7 @@ import pytest
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
from uniseg.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
from .. import character_error_rate, page_text, alto_text
|
from .. import alto_text, character_error_rate, page_text
|
||||||
|
|
||||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||||
|
|
53
src/dinglehopper/tests/test_integ_cli_dir.py
Normal file
53
src/dinglehopper/tests/test_integ_cli_dir.py
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from ocrd_utils import initLogging
|
||||||
|
|
||||||
|
from dinglehopper.cli import process_dir
|
||||||
|
|
||||||
|
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_cli_directory(tmp_path):
    """
    Test that process_dir() processes a directory of files and writes a JSON
    and an HTML report for each of them.
    """
    test_dir = os.path.join(data_dir, "directory-test")

    initLogging()
    # NOTE(review): the trailing positional arguments (False, True, "line") are
    # passed through unchanged; confirm their meaning against process_dir().
    process_dir(
        os.path.join(test_dir, "gt"),
        os.path.join(test_dir, "ocr"),
        "report",
        str(tmp_path / "reports"),
        False,
        True,
        "line",
    )

    expected_reports = (
        "reports/1.xml-report.json",
        "reports/1.xml-report.html",
        "reports/2.xml-report.json",
        "reports/2.xml-report.html",
    )
    for relative_path in expected_reports:
        assert os.path.exists(tmp_path / relative_path)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_cli_fail_without_gt(tmp_path):
    """
    Test that process_dir() skips a file that has no corresponding file in the
    other directory.
    """
    gt_dir = os.path.join(data_dir, "directory-test", "gt")
    ocr_dir = os.path.join(data_dir, "directory-test", "ocr")
    reports_dir = tmp_path / "reports"

    initLogging()
    process_dir(gt_dir, ocr_dir, "report", str(reports_dir), False, True, "line")

    # Exactly four report files are expected.
    # NOTE(review): presumably two matched file pairs with a JSON and an HTML
    # report each - confirm against the contents of the test data directory.
    written_reports = os.listdir(reports_dir)
    assert len(written_reports) == 2 * 2
|
|
@ -1,9 +1,9 @@
|
||||||
import json
|
import json
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from .util import working_directory
|
|
||||||
|
|
||||||
from ..cli import process
|
from ..cli import process
|
||||||
|
from .util import working_directory
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.integration
|
@pytest.mark.integration
|
37
src/dinglehopper/tests/test_integ_differences.py
Normal file
37
src/dinglehopper/tests/test_integ_differences.py
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from ocrd_utils import initLogging
|
||||||
|
|
||||||
|
from dinglehopper.cli import process
|
||||||
|
|
||||||
|
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_cli_differences(tmp_path):
    """Test that process() yields a JSON report that includes the differences.

    The report is generated for the test GT/OCR PAGE files with
    ``differences=True`` and must list the expected character-level and
    word-level differences found between the GT and OCR text.
    """
    initLogging()
    process(
        os.path.join(data_dir, "test-gt.page2018.xml"),
        os.path.join(data_dir, "test-fake-ocr.page2018.xml"),
        "report",
        tmp_path,
        differences=True,
    )

    report_path = tmp_path / "report.json"
    assert os.path.exists(report_path)

    # Decode explicitly as UTF-8: the expected keys below contain non-ASCII
    # characters ("ſ"), so relying on the locale's default encoding would make
    # the comparison platform-dependent.
    with open(report_path, "r", encoding="utf-8") as jsonf:
        j = json.load(jsonf)

    assert j["differences"] == {
        "character_level": {"n :: m": 1, "ſ :: f": 1},
        "word_level": {
            "Augenblick :: Augemblick": 1,
            "Verſprochene :: Verfprochene": 1,
        },
    }
|
|
@ -5,7 +5,7 @@ import os
|
||||||
import pytest
|
import pytest
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
|
|
||||||
from .. import distance, page_text, alto_text
|
from .. import alto_text, distance, page_text
|
||||||
|
|
||||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||||
|
|
|
@ -1,21 +1,20 @@
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import json
|
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from click.testing import CliRunner
|
from click.testing import CliRunner
|
||||||
from .util import working_directory
|
|
||||||
|
|
||||||
|
|
||||||
from ..ocrd_cli import ocrd_dinglehopper
|
from ..ocrd_cli import ocrd_dinglehopper
|
||||||
|
from .util import working_directory
|
||||||
|
|
||||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.integration
|
@pytest.mark.integration
|
||||||
@pytest.mark.skipif(sys.platform == 'win32', reason="only on unix")
|
@pytest.mark.skipif(sys.platform == "win32", reason="only on unix")
|
||||||
def test_ocrd_cli(tmp_path):
|
def test_ocrd_cli(tmp_path):
|
||||||
"""Test OCR-D interface"""
|
"""Test OCR-D interface"""
|
||||||
|
|
110
src/dinglehopper/tests/test_integ_summarize.py
Normal file
110
src/dinglehopper/tests/test_integ_summarize.py
Normal file
|
@ -0,0 +1,110 @@
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from .. import cli_summarize
|
||||||
|
from .util import working_directory
|
||||||
|
|
||||||
|
# Values the summary is expected to contain for the two mock reports written by
# the create_summaries fixture: the means of the per-report CER and WER values ...
expected_cer_avg = (0.05 + 0.10) / 2
expected_wer_avg = (0.15 + 0.20) / 2
# ... and, per key, the sums of the difference counts of both reports.
expected_diff_c = {"a": 10 + 20, "b": 20 + 30}
expected_diff_w = {"c": 30 + 40, "d": 40 + 50}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def create_summaries(tmp_path):
    """Write two JSON reports with mock data into a new "reports" directory.

    Returns the path of that directory as a string.
    """
    reports_dirname = tmp_path / "reports"
    reports_dirname.mkdir()

    # (file name, report content) pairs, written in this order.
    mock_reports = (
        (
            "report1.json",
            {
                "cer": 0.05,
                "wer": 0.15,
                "differences": {
                    "character_level": {"a": 10, "b": 20},
                    "word_level": {"c": 30, "d": 40},
                },
            },
        ),
        (
            "report2.json",
            {
                "cer": 0.10,
                "wer": 0.20,
                "differences": {
                    "character_level": {"a": 20, "b": 30},
                    "word_level": {"c": 40, "d": 50},
                },
            },
        ),
    )
    for filename, report in mock_reports:
        with open(os.path.join(reports_dirname, filename), "w") as report_file:
            json.dump(report, report_file)

    return str(reports_dirname)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_cli_summarize_json(tmp_path, create_summaries):
    """Test that cli_summarize.process() yields a summarized JSON report"""
    with working_directory(tmp_path):
        cli_summarize.process(create_summaries)

        summary_path = os.path.join(create_summaries, "summary.json")
        with open(summary_path, "r") as summary_file:
            summary_data = json.load(summary_file)

        # Report count and averages over the two mock reports
        assert summary_data["num_reports"] == 2
        assert summary_data["cer_avg"] == expected_cer_avg
        assert summary_data["wer_avg"] == expected_wer_avg

        # Difference counts accumulated over the two mock reports
        differences = summary_data["differences"]
        assert differences["character_level"] == expected_diff_c
        assert differences["word_level"] == expected_diff_w
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_cli_summarize_html(tmp_path, create_summaries):
    """Test that cli_summarize.process() yields an HTML summary report"""
    with working_directory(tmp_path):
        cli_summarize.process(create_summaries)

        summary_path = os.path.join(create_summaries, "summary.html")
        assert os.path.isfile(summary_path)

        with open(summary_path, "r") as summary_file:
            html = summary_file.read()

        assert len(html) > 0

        # The report count and the rounded averages must show up in the page.
        expected_snippets = (
            "Number of reports: 2",
            f"Average CER: {round(expected_cer_avg, 4)}",
            f"Average WER: {round(expected_wer_avg, 4)}",
        )
        for snippet in expected_snippets:
            assert snippet in html
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries):
    """
    Test that cli_summarize.process() leaves out reports that lack a WER value.
    """
    with working_directory(tmp_path):
        reports_dirname = create_summaries

        # A third report without a "wer" entry; the summary must not count it.
        incomplete_report = {
            "cer": 0.10,
            "differences": {
                "character_level": {"a": 20, "b": 30},
                "word_level": {"c": 40, "d": 50},
            },
        }
        incomplete_path = os.path.join(reports_dirname, "report3-missing-wer.json")
        with open(incomplete_path, "w") as report_file:
            json.dump(incomplete_report, report_file)

        cli_summarize.process(reports_dirname)

        summary_path = os.path.join(reports_dirname, "summary.html")
        assert os.path.isfile(summary_path)

        with open(summary_path, "r") as summary_file:
            html = summary_file.read()

        assert "Number of reports: 2" in html  # report3 is not included
|
|
@ -5,15 +5,15 @@ import os
|
||||||
import pytest
|
import pytest
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
|
|
||||||
from .. import word_error_rate, words, page_text, alto_text
|
from .. import alto_text, page_text, word_error_rate, words
|
||||||
|
|
||||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.integration
|
@pytest.mark.integration
|
||||||
def test_word_error_rate_between_page_files():
|
def test_word_error_rate_between_page_files():
|
||||||
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi. So we have 3 changed words,
|
# In the fake OCR file, we changed 2 characters and replaced a fi ligature with fi.
|
||||||
# the ligature does not count → 2 errors
|
# So we have 3 changed words, the ligature does not count → 2 errors
|
||||||
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
|
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
|
||||||
|
|
||||||
gt_word_count = (
|
gt_word_count = (
|
|
@ -1,13 +1,11 @@
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import lxml.etree as ET
|
|
||||||
import textwrap
|
import textwrap
|
||||||
|
|
||||||
import pytest
|
import lxml.etree as ET
|
||||||
|
|
||||||
from .util import working_directory
|
|
||||||
from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
|
from .. import alto_namespace, alto_text, page_namespace, page_text, plain_text, text
|
||||||
|
from .util import working_directory
|
||||||
|
|
||||||
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||||
|
|
||||||
|
@ -161,7 +159,8 @@ def test_page_level():
|
||||||
result = page_text(tree, textequiv_level="line")
|
result = page_text(tree, textequiv_level="line")
|
||||||
assert (
|
assert (
|
||||||
result
|
result
|
||||||
== "Hand, Mylord? fragte der Graf von Rocheſter.\nAls er einsmals in dem Oberhauſe eine Bill we-"
|
== "Hand, Mylord? fragte der Graf von Rocheſter.\n"
|
||||||
|
+ "Als er einsmals in dem Oberhauſe eine Bill we-"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -27,7 +27,8 @@ def test_words():
|
||||||
def test_words_private_use_area():
|
def test_words_private_use_area():
|
||||||
result = list(
|
result = list(
|
||||||
words(
|
words(
|
||||||
"ber die vielen Sorgen wegen deelben vergaß Hartkopf, der Frau Amtmnnin das ver⸗\n"
|
"ber die vielen Sorgen wegen deelben vergaß Hartkopf, "
|
||||||
|
"der Frau Amtmnnin das ver⸗\n"
|
||||||
"ſproene zu berliefern."
|
"ſproene zu berliefern."
|
||||||
)
|
)
|
||||||
)
|
)
|
|
@ -1,8 +1,8 @@
|
||||||
|
import os
|
||||||
from itertools import zip_longest
|
from itertools import zip_longest
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
import colorama
|
import colorama
|
||||||
import os
|
|
||||||
|
|
||||||
|
|
||||||
def diffprint(x, y):
|
def diffprint(x, y):
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue