Merge branch 'master' into performance
commit 38fcbc8e1c
@@ -1,23 +0,0 @@
version: 2.1

jobs:
  test:
    parameters:
      python-version:
        type: string
    docker:
      - image: cimg/python:<< parameters.python-version >>
    steps:
      - checkout
      - run: pip3 install --upgrade pip
      - run: pip3 install -r requirements.txt
      - run: pip3 install pytest
      - run: pytest

workflows:
  all-tests:
    jobs:
      - test:
          matrix:
            parameters:
              python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
@@ -0,0 +1,14 @@
#!/bin/bash

# We call setuptools.setup() here as we may rely on setuptools to interpret
# a dynamic version field. (Reading pyproject.toml is not enough in that case.)
expected_git_tag="v$(python -c 'from setuptools import setup; setup()' --version)"
actual_git_tag="$(git describe --tags)"

if [[ "$expected_git_tag" == "$actual_git_tag" ]]; then
  echo "OK: Python package version $expected_git_tag matches git tag"
  exit 0
else
  echo "ERROR: Python package version $expected_git_tag does NOT match git tag $actual_git_tag"
  exit 1
fi
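For illustration only, a rough Python equivalent of this check (a sketch under the assumption that setuptools is installed and the command runs from the repository root; the release workflow itself uses the shell script above):

import subprocess

# Let setuptools resolve the (possibly dynamic) package version, exactly as the
# shell script does, rather than parsing pyproject.toml directly.
pkg_version = subprocess.run(
    ["python", "-c", "from setuptools import setup; setup()", "--version"],
    capture_output=True, text=True, check=True,
).stdout.strip()

# Release tags are expected to be the version prefixed with "v", e.g. v0.9.4.
git_tag = subprocess.run(
    ["git", "describe", "--tags"], capture_output=True, text=True, check=True
).stdout.strip()

if f"v{pkg_version}" != git_tag:
    raise SystemExit(f"ERROR: package version v{pkg_version} does not match git tag {git_tag}")
print(f"OK: package version v{pkg_version} matches git tag {git_tag}")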
@@ -0,0 +1,69 @@
name: release

on:
  push:
    tags:
      - "v*.*.*"

env:
  PYPI_URL: https://pypi.org/p/dinglehopper

jobs:
  test:
    uses: ./.github/workflows/test.yml

  build:
    needs: test
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Upgrade pip
        run: python3 -m pip install --upgrade pip
      - name: Install setuptools
        run: |
          python3 -m pip install --upgrade setuptools
          # For OCR-D tools, we need setuptools-ocrd to get the version
          if [ -e ocrd-tool.json ]; then
            python3 -m pip install setuptools-ocrd
          fi
      - name: Check git tag vs package version
        run: .github/workflows/release-check-version-tag
      - name: Build package
        run: python3 -m pip install --upgrade build && python3 -m build
      - name: Upload dist
        uses: actions/upload-artifact@v3
        with:
          name: dist
          path: dist/

  github-release:
    needs: build
    runs-on: ubuntu-latest
    steps:
      - name: Download dist
        uses: actions/download-artifact@v3
        with:
          name: dist
          path: dist/
      - name: Create release on GitHub
        uses: softprops/action-gh-release@v1
        with:
          files: dist/*

  pypi-publish:
    needs: build
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: ${{ env.PYPI_URL }}
    permissions:
      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
    steps:
      - name: Download dist
        uses: actions/download-artifact@v3
        with:
          name: dist
          path: dist/
      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,76 @@
name: test

on:

  push:
    branches:
      - master

  pull_request:
    branches:
      - master

  schedule:
    - cron: "00 16 07 * *"  # = monthly

  # Allow manually running (from GitHub Web)
  workflow_dispatch:

  # Allow calling this workflow (e.g. from release workflow)
  workflow_call:

jobs:
  test:

    strategy:
      fail-fast: false
      matrix:
        python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12" ]

    # For Python 3.6, we need to fall back to Ubuntu 20.04
    runs-on: ${{ matrix.python-version == '3.6' && 'ubuntu-20.04' || 'ubuntu-latest' }}

    env:
      test_results_dir: test-results-${{ matrix.python-version }}

    steps:
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Checkout
        uses: actions/checkout@v3

      - name: Update pip
        run: python3 -m pip install -U pip
      - name: Avoid compiling OpenCV and NumPy on Python 3.6
        run: |
          if python3 --version | grep -q "Python 3.6"; then
            pip install --prefer-binary -U opencv-python-headless numpy
          fi
      - name: Install requirements*.txt
        run: |
          for requirements_txt in requirements*.txt; do
            python3 -m pip install -r $requirements_txt;
          done

      - name: Test
        run: |
          cd src
          mkdir -p ../$test_results_dir
          python3 -m pytest --junitxml=../$test_results_dir/junit.xml -o junit_family=legacy
      - name: Upload test results
        uses: actions/upload-artifact@v3
        if: success() || failure()
        with:
          name: ${{ env.test_results_dir }}
          path: ${{ env.test_results_dir }}

      - name: Report tests
        uses: dorny/test-reporter@v1
        if: success() || failure()
        with:
          name: Results on Python ${{ matrix.python-version }}
          path: "${{ env.test_results_dir }}/junit.xml"
          reporter: java-junit
@@ -0,0 +1,36 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-json
      - id: check-toml
      - id: check-yaml
      - id: check-added-large-files
      - id: check-ast

  - repo: https://github.com/psf/black
    rev: 23.10.0
    hooks:
      - id: black

  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.1.1
    hooks:
      - args:
          - --fix
          - --exit-non-zero-on-fix
        id: ruff

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.6.1
    hooks:
      - additional_dependencies:
          - types-setuptools
        id: mypy

  - repo: https://gitlab.com/vojko.pribudic/pre-commit-update
    rev: v0.1.0
    hooks:
      - id: pre-commit-update
@@ -1 +1 @@
qurator/dinglehopper/ocrd-tool.json
src/dinglehopper/ocrd-tool.json
@@ -0,0 +1,70 @@
[build-system]
requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"]

[project]
name = "dinglehopper"
authors = [
    {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
    {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
]
description = "The OCR evaluation tool"
readme = "README.md"
requires-python = ">=3.6"
keywords = ["qurator", "ocr", "evaluation", "ocr-d"]

dynamic = ["version", "dependencies", "optional-dependencies"]

# https://pypi.org/classifiers/
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Environment :: Console",
    "Intended Audience :: Science/Research",
    "Intended Audience :: Other Audience",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
    "Topic :: Scientific/Engineering :: Information Analysis",
    "Topic :: Text Processing",
]

[project.scripts]
dinglehopper = "dinglehopper.cli:main"
dinglehopper-line-dirs = "dinglehopper.cli_line_dirs:main"
dinglehopper-extract = "dinglehopper.cli_extract:main"
dinglehopper-summarize = "dinglehopper.cli_summarize:main"
ocrd-dinglehopper = "dinglehopper.ocrd_cli:ocrd_dinglehopper"


[project.urls]
Homepage = "https://github.com/qurator-spk/dinglehopper"
Repository = "https://github.com/qurator-spk/dinglehopper.git"


[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}
optional-dependencies.dev = {file = ["requirements-dev.txt"]}

[tool.setuptools.packages.find]
where = ["src"]

[tool.setuptools.package-data]
dinglehopper = ["templates/*"]


[tool.pytest.ini_options]
minversion = 6.0
addopts = "--strict-markers"
markers = [
    "integration: integration tests",
]


[tool.mypy]
ignore_missing_imports = true


[tool.ruff]
select = ["E", "F", "I"]
ignore = [
    "F811",  # multimethods are considered redefinitions by ruff
]
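Since version, dependencies, and optional-dependencies are declared dynamic above, they are resolved by setuptools at build time rather than written out in pyproject.toml. A small, illustrative sketch of how the resolved metadata of an installed copy could be inspected (it assumes the package has been installed; this is not part of the change itself):

from importlib.metadata import metadata, requires, version

# Version as resolved at build time (for this project, via setuptools-ocrd
# from ocrd-tool.json, per the build-system requirements above).
print(version("dinglehopper"))

# Dependencies that setuptools read from requirements.txt.
for req in requires("dinglehopper") or []:
    print(req)

# Core metadata fields such as the summary come from the [project] table.
print(metadata("dinglehopper")["Summary"])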
@@ -1,4 +0,0 @@
[pytest]
markers =
    integration: integration tests
    serial
@@ -1 +0,0 @@
__import__("pkg_resources").declare_namespace(__name__)
@@ -1,5 +0,0 @@
from .ocr_files import *
from .extracted_text import *
from .character_error_rate import *
from .word_error_rate import *
from .align import *
@@ -1,15 +0,0 @@
function find_diff_class(classes) {
    return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
}

$(document).ready(function() {
    /* Enable Bootstrap tooltips */
    $('[data-toggle="tooltip"]').tooltip();

    $('.diff').mouseover(function() {
        find_diff_class($(this).attr('class')).addClass('diff-highlight');
    });
    $('.diff').mouseout(function() {
        find_diff_class($(this).attr('class')).removeClass('diff-highlight');
    });
});
@@ -1,5 +1,8 @@
pytest
pytest-flake8
pytest-cov
pytest-mypy
black
pre-commit

ruff ; python_version >= "3.7"
pytest-ruff ; python_version >= "3.7"
@@ -1,12 +0,0 @@
[flake8]
max-line-length = 88
extend-ignore = E203, W503

[pylint]
max-line-length = 88

[pylint.messages_control]
disable = C0330, C0326

[mypy]
ignore_missing_imports = True
@@ -1,34 +0,0 @@
from io import open
from setuptools import find_packages, setup

with open("requirements.txt") as fp:
    install_requires = fp.read()

with open('requirements-dev.txt') as fp:
    tests_require = fp.read()

setup(
    name="dinglehopper",
    author="Mike Gerber, The QURATOR SPK Team",
    author_email="mike.gerber@sbb.spk-berlin.de, qurator@sbb.spk-berlin.de",
    description="The OCR evaluation tool",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
    keywords="qurator ocr",
    license="Apache",
    namespace_packages=["qurator"],
    packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
    install_requires=install_requires,
    tests_require=tests_require,
    package_data={
        "": ["*.json", "templates/*"],
    },
    entry_points={
        "console_scripts": [
            "dinglehopper=qurator.dinglehopper.cli:main",
            "dinglehopper-line-dirs=qurator.dinglehopper.cli_line_dirs:main",
            "dinglehopper-extract=qurator.dinglehopper.cli_extract:main",
            "ocrd-dinglehopper=qurator.dinglehopper.ocrd_cli:ocrd_dinglehopper",
        ]
    },
)
@@ -0,0 +1,33 @@
from .align import align, score_hint, seq_align
from .character_error_rate import character_error_rate, character_error_rate_n
from .edit_distance import distance, editops
from .extracted_text import ExtractedText
from .ocr_files import (
    alto_namespace,
    alto_text,
    page_namespace,
    page_text,
    plain_text,
    text,
)
from .word_error_rate import word_error_rate, word_error_rate_n, words

__all__ = [
    "editops",
    "distance",
    "align",
    "score_hint",
    "seq_align",
    "character_error_rate",
    "character_error_rate_n",
    "word_error_rate",
    "word_error_rate_n",
    "words",
    "ExtractedText",
    "alto_namespace",
    "alto_text",
    "page_namespace",
    "page_text",
    "plain_text",
    "text",
]
@@ -1,9 +1,12 @@
import math
import unicodedata
from math import ceil

from .edit_distance import *
from rapidfuzz.distance import Levenshtein

from .edit_distance import grapheme_clusters


def align(t1, t2):
    """Align text."""
    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
@@ -0,0 +1,106 @@
import json
import os

import click
from jinja2 import Environment, FileSystemLoader
from ocrd_utils import initLogging

from dinglehopper.cli import json_float


def process(reports_folder, occurrences_threshold=1):
    cer_list = []
    wer_list = []
    cer_sum = 0
    wer_sum = 0
    diff_c = {}
    diff_w = {}

    for report in os.listdir(reports_folder):
        if report.endswith(".json"):
            with open(os.path.join(reports_folder, report), "r") as f:
                report_data = json.load(f)

            if "cer" not in report_data or "wer" not in report_data:
                click.echo(
                    f"Skipping {report} because it does not contain CER and WER"
                )
                continue

            cer = report_data["cer"]
            wer = report_data["wer"]
            cer_list.append(cer)
            wer_list.append(wer)
            cer_sum += cer
            wer_sum += wer

            try:
                for key, value in report_data["differences"][
                    "character_level"
                ].items():
                    diff_c[key] = diff_c.get(key, 0) + value
                for key, value in report_data["differences"]["word_level"].items():
                    diff_w[key] = diff_w.get(key, 0) + value
            except KeyError:
                pass

    if len(cer_list) == 0:
        click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
        return

    cer_avg = cer_sum / len(cer_list)
    wer_avg = wer_sum / len(wer_list)

    print(f"Number of reports: {len(cer_list)}")
    print(f"Average CER: {cer_avg}")
    print(f"Average WER: {wer_avg}")
    print(f"Sum of CER: {cer_sum}")
    print(f"Sum of WER: {wer_sum}")

    env = Environment(
        loader=FileSystemLoader(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
        )
    )
    env.filters["json_float"] = json_float
    for report_suffix in (".html", ".json"):
        template_fn = "summary" + report_suffix + ".j2"

        out_fn = os.path.join(reports_folder, "summary" + report_suffix)
        template = env.get_template(template_fn)
        template.stream(
            num_reports=len(cer_list),
            cer_avg=cer_avg,
            wer_avg=wer_avg,
            diff_c=diff_c,
            diff_w=diff_w,
            occurrences_threshold=occurrences_threshold,
        ).dump(out_fn)


@click.command()
@click.argument("reports_folder", type=click.Path(exists=True), default="./reports")
@click.option(
    "--occurrences-threshold",
    type=int,
    default=1,
    help="Only show differences that occur at least this many times.",
)
def main(reports_folder, occurrences_threshold):
    """
    Summarize the results from multiple reports generated earlier by dinglehopper.
    It calculates the average CER and WER, as well as a sum of common mistakes.
    Reports include lists of mistakes and their occurrences.

    You may use a threshold to reduce the file size of the HTML report by only showing
    mistakes whose number of occurrences is above the threshold. The JSON report will
    always contain all mistakes.

    All JSON files in the provided folder will be gathered and summarized.
    """
    initLogging()
    process(reports_folder, occurrences_threshold)


if __name__ == "__main__":
    main()
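A minimal usage sketch for the new summarize module (the reports path here is hypothetical; the same code is also exposed as the dinglehopper-summarize console script declared in pyproject.toml):

from dinglehopper import cli_summarize

# Summarize all dinglehopper JSON reports in ./reports; this writes
# summary.json and summary.html into that same folder. Differences occurring
# fewer than 2 times are omitted from the HTML report only; the JSON report
# always lists all of them.
cli_summarize.process("./reports", occurrences_threshold=2)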
@@ -1,8 +1,8 @@
import unicodedata

from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters
from rapidfuzz.distance import Levenshtein
from uniseg.graphemecluster import grapheme_clusters

from .extracted_text import ExtractedText

@@ -1,4 +1,5 @@
{
  "version": "0.9.4",
  "git_url": "https://github.com/qurator-spk/dinglehopper",
  "tools": {
    "ocrd-dinglehopper": {
@@ -0,0 +1,39 @@
function find_diff_class(classes) {
    return $('.' + classes.split(/\s+/).find(x => x.match(/.diff\d.*/)));
}

$(document).ready(function() {
    /* Enable Bootstrap tooltips */
    $('[data-toggle="tooltip"]').tooltip();

    $('.diff').mouseover(function() {
        find_diff_class($(this).attr('class')).addClass('diff-highlight');
    });
    $('.diff').mouseout(function() {
        find_diff_class($(this).attr('class')).removeClass('diff-highlight');
    });

    /* Sort this column of the table */
    $('th').click(function () {
        var table = $(this).closest('table');
        var rows = table.find('tbody > tr').toArray().sort(compareRows($(this).index()));
        this.asc = !this.asc;
        if (!this.asc) {
            rows = rows.reverse();
        }
        for (var i = 0; i < rows.length; i++) {
            table.children('tbody').append(rows[i]);
        }
    });

    function compareRows(index) {
        return function (row1, row2) {
            var cell1 = $(row1).children('td').eq(index).text().toLowerCase();
            var cell2 = $(row2).children('td').eq(index).text().toLowerCase();
            return cell1.localeCompare(cell2, undefined, {
                numeric: true,
                sensitivity: 'base'
            });
        }
    }
});
@@ -0,0 +1,136 @@
<!doctype html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">

    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
    <style type="text/css">
        {% if metrics %}
        .gt .diff {
            color: green;
        }
        .ocr .diff {
            color: red;
        }
        {% else %}
        .gt .diff, .ocr .diff {
            color: blue;
        }
        {% endif %}
        .ellipsis {
            opacity: 0.5;
            font-style: italic;
        }
        .diff-highlight {
            border: 2px solid;
            border-radius: 5px;
        }

        .row {
            margin-bottom: 20px;
        }

        table {
            width: 100%;
        }

        .cer {
            flex-direction: column;
        }

        tr:hover {
            background-color: #f5f5f5;
        }

        th {
            cursor: pointer;
        }

        th:hover {
            background-color: #eee;
        }

        td {
            min-width: 100px;
        }

        td:hover {
            background-color: #eee;
        }
    </style>
</head>
<body>

<div class="container">

    <div class="row">
        <h1>Summary of all reports</h1>
    </div>

    <div class="row">
        <p>Number of reports: {{ num_reports }}</p>
    </div>

    {% if cer_avg and wer_avg -%}
    <div class="row">
        <h2>Metrics</h2>
    </div>

    <div class="row cer">
        <p>Average CER: {{ cer_avg|round(4) }}</p>
        <p>Average WER: {{ wer_avg|round(4) }}</p>
    </div>
    {% endif %}

    {%- if diff_c and diff_w %}
    {%- set sections = [{'title': 'Found differences (character)', 'data': diff_c}, {'title': 'Found differences (word)', 'data': diff_w}] %}

    <div class="row">
        {%- for section in sections %}
        <div class="col-md-6">
            <h2>{{ section['title'] }}</h2>
            <table>
                <thead>
                    <tr><th>GT</th><th>OCR</th><th>Occurrences</th></tr>
                </thead>
                {%- set num_omitted = namespace(value=0) -%}
                {% for gt_ocr, occurrences in section['data'].items() -%}
                {% if occurrences < occurrences_threshold -%}
                {%- set num_omitted.value = num_omitted.value + 1 %}
                {%- else -%}
                {%- set gt = gt_ocr.split(" :: ")[0] %}
                {%- set ocr = gt_ocr.split(" :: ")[1] %}
                <tr>
                    <td title="{{ gt|urlencode }}">{{ gt }}</td>{# display the unicode character #}
                    <td title="{{ ocr|urlencode }}">{{ ocr }}</td>
                    <td>{{ occurrences }}</td>
                </tr>
                {%- endif %}
                {%- endfor %}

                {% if num_omitted.value > 0 and occurrences_threshold > 1 -%}
                <p>Skipped {{ num_omitted.value }} diffs with fewer than {{ occurrences_threshold }} occurrences. The complete list of diffs is available in the accompanying JSON file.</p>
                {%- set num_omitted.value = 0 %}
                {%- endif %}
            </table>
        </div>
        {%- endfor %}
    </div>
    {%- endif %}

</div>



<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>

<script>
    {% include 'report.html.js' %}
</script>


</body>
</html>
@@ -0,0 +1,15 @@
{
  "num_reports": {{ num_reports }}
  {%- if cer_avg and wer_avg %}
  ,
  "cer_avg": {{ cer_avg|json_float }},
  "wer_avg": {{ wer_avg|json_float }}
  {%- endif %}
  {%- if diff_c and diff_w %}
  ,
  "differences": {
    "character_level": {{ diff_c|tojson }},
    "word_level": {{ diff_w|tojson }}
  }
  {%- endif %}
}
(7 file diffs suppressed because they are too large)
@@ -1 +1 @@
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
(binary image file changed; size 426 KiB before and after)
@@ -0,0 +1,28 @@
from __future__ import division, print_function

import os

import pytest
from lxml import etree as ET

from .. import alto_text, character_error_rate, page_text

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
def test_bigger_texts():
    gt = page_text(
        ET.parse(os.path.join(data_dir, "bigger-texts", "00008228", "00008228.gt.xml"))
    )
    ocr = alto_text(
        ET.parse(
            os.path.join(
                data_dir, "bigger-texts", "00008228", "00008228-00236534.gt4hist.xml"
            )
        )
    )

    # Only interested in a result here: In earlier versions this would have used
    # tens of GB of RAM and should now not break a sweat.
    assert character_error_rate(gt, ocr) >= 0.0
@@ -0,0 +1,53 @@
import os

import pytest
from ocrd_utils import initLogging

from dinglehopper.cli import process_dir

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
def test_cli_directory(tmp_path):
    """
    Test that the cli/process_dir() processes a directory of files and
    yields JSON and HTML reports.
    """

    initLogging()
    process_dir(
        os.path.join(data_dir, "directory-test", "gt"),
        os.path.join(data_dir, "directory-test", "ocr"),
        "report",
        str(tmp_path / "reports"),
        False,
        True,
        "line",
    )

    assert os.path.exists(tmp_path / "reports/1.xml-report.json")
    assert os.path.exists(tmp_path / "reports/1.xml-report.html")
    assert os.path.exists(tmp_path / "reports/2.xml-report.json")
    assert os.path.exists(tmp_path / "reports/2.xml-report.html")


@pytest.mark.integration
def test_cli_fail_without_gt(tmp_path):
    """
    Test that the cli/process_dir skips a file if there is no corresponding file
    in the other directory.
    """

    initLogging()
    process_dir(
        os.path.join(data_dir, "directory-test", "gt"),
        os.path.join(data_dir, "directory-test", "ocr"),
        "report",
        str(tmp_path / "reports"),
        False,
        True,
        "line",
    )

    assert len(os.listdir(tmp_path / "reports")) == 2 * 2
@@ -1,9 +1,9 @@
import json

import pytest
from .util import working_directory

from ..cli import process
from .util import working_directory


@pytest.mark.integration
@@ -1,21 +1,20 @@
import json
import os
import shutil
import json
import sys
from pathlib import Path

import pytest
from click.testing import CliRunner
from .util import working_directory


from ..ocrd_cli import ocrd_dinglehopper
from .util import working_directory

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


@pytest.mark.integration
@pytest.mark.skipif(sys.platform == 'win32', reason="only on unix")
@pytest.mark.skipif(sys.platform == "win32", reason="only on unix")
def test_ocrd_cli(tmp_path):
    """Test OCR-D interface"""
@@ -0,0 +1,110 @@
import json
import os

import pytest

from .. import cli_summarize
from .util import working_directory

expected_cer_avg = (0.05 + 0.10) / 2
expected_wer_avg = (0.15 + 0.20) / 2
expected_diff_c = {"a": 30, "b": 50}
expected_diff_w = {"c": 70, "d": 90}


@pytest.fixture
def create_summaries(tmp_path):
    """Create two summary reports with mock data"""
    reports_dirname = tmp_path / "reports"
    reports_dirname.mkdir()

    report1 = {
        "cer": 0.05,
        "wer": 0.15,
        "differences": {
            "character_level": {"a": 10, "b": 20},
            "word_level": {"c": 30, "d": 40},
        },
    }
    report2 = {
        "cer": 0.10,
        "wer": 0.20,
        "differences": {
            "character_level": {"a": 20, "b": 30},
            "word_level": {"c": 40, "d": 50},
        },
    }

    with open(os.path.join(reports_dirname, "report1.json"), "w") as f:
        json.dump(report1, f)
    with open(os.path.join(reports_dirname, "report2.json"), "w") as f:
        json.dump(report2, f)

    return str(reports_dirname)


@pytest.mark.integration
def test_cli_summarize_json(tmp_path, create_summaries):
    """Test that the cli/process() yields a summarized JSON report"""
    with working_directory(tmp_path):
        reports_dirname = create_summaries
        cli_summarize.process(reports_dirname)

        with open(os.path.join(reports_dirname, "summary.json"), "r") as f:
            summary_data = json.load(f)

        assert summary_data["num_reports"] == 2
        assert summary_data["cer_avg"] == expected_cer_avg
        assert summary_data["wer_avg"] == expected_wer_avg
        assert summary_data["differences"]["character_level"] == expected_diff_c
        assert summary_data["differences"]["word_level"] == expected_diff_w


@pytest.mark.integration
def test_cli_summarize_html(tmp_path, create_summaries):
    """Test that the cli/process() yields an HTML report"""
    with working_directory(tmp_path):
        reports_dirname = create_summaries
        cli_summarize.process(reports_dirname)

        html_file = os.path.join(reports_dirname, "summary.html")
        assert os.path.isfile(html_file)

        with open(html_file, "r") as f:
            contents = f.read()

        assert len(contents) > 0
        assert "Number of reports: 2" in contents
        assert f"Average CER: {round(expected_cer_avg, 4)}" in contents
        assert f"Average WER: {round(expected_wer_avg, 4)}" in contents


@pytest.mark.integration
def test_cli_summarize_html_skip_invalid(tmp_path, create_summaries):
    """
    Test that the cli/process() does not include reports that are missing a WER value.
    """
    with working_directory(tmp_path):
        reports_dirname = create_summaries

        # This third report has no WER value and should not be included in the summary
        report3 = {
            "cer": 0.10,
            "differences": {
                "character_level": {"a": 20, "b": 30},
                "word_level": {"c": 40, "d": 50},
            },
        }

        with open(os.path.join(reports_dirname, "report3-missing-wer.json"), "w") as f:
            json.dump(report3, f)

        cli_summarize.process(reports_dirname)

        html_file = os.path.join(reports_dirname, "summary.html")
        assert os.path.isfile(html_file)

        with open(html_file, "r") as f:
            contents = f.read()

        assert "Number of reports: 2" in contents  # report3 is not included
@@ -1,8 +1,8 @@
import os
from itertools import zip_longest
from typing import Iterable

import colorama
import os


def diffprint(x, y):
Some files were not shown because too many files have changed in this diff.