Compare commits


167 Commits

Author SHA1 Message Date
Mike Gerber f6dfb77f94 🐛 pyproject.toml: Fix description 2 days ago
Mike Gerber ef817cb343 📦 v0.10.0 2 days ago
Mike Gerber b1c109baae
Merge pull request #128 from kba/v3-api
V3 api
2 days ago
Mike Gerber 13ab1ae150 🐛 Docker: Use same vendor as license for now 2 days ago
Mike Gerber d974369e13 🐛 Docker: Fix description 2 days ago
Mike Gerber b7bdca4ac8 🐛 Makefile: Make phony targets .PHONY 2 days ago
kba 831a24fc4c typo: report_prefix -> file_id 3 days ago
Konstantin Baierer f6a2c94520 ocrd_cli: but do check for existing output files
Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
3 days ago
Konstantin Baierer 4162836612 ocrd_cli: no need to check fileGrp dir exists
Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
3 days ago
Konstantin Baierer c0aa82d188 OCR-D processor: properly handle missing or non-downloaded GT/OCR file
Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
3 days ago
kba 8c1b6d65f5 Dockerfile: build ocrd-all-tool.json 3 days ago
Mike Gerber f287386c0e 🧹Don't pin uniseg and rapidfuzz
Breakage with the newest uniseg API was fixed in master.

Can't see any issue with rapidfuzz, so removing that pin, too.
3 days ago
kba 63031b30bf Port to OCR-D/core API v3 3 days ago
Mike Gerber bf6633be02
Merge pull request #136 from qurator-spk/chore/update-liccheck
⚙  liccheck: update permissible licenses (mit-cmu, psf 2.0, iscl)
3 days ago
Mike Gerber d3aa9eb520 ⚙ liccheck: update permissible licenses (mit-cmu, psf 2.0, iscl) 3 days ago
Mike Gerber 625686f204
Merge pull request #135 from qurator-spk/chore/update-python-version
⚙  pyproject.toml: Update supported Python version
3 days ago
Mike Gerber ce7886af23 ⚙ pyproject.toml: Update supported Python version 3 days ago
Mike Gerber a09a624bde
Merge pull request #132 from qurator-spk/fix/uniseg-removed-index-parameter
🐛 Fix for changed API of uniseg's word_break
3 days ago
Mike Gerber badfa9c99e ⚙ GitHub Actions: Don't test on Python 3.8 anymore 3 days ago
Mike Gerber 7f8a8dd564 🐛 Fix for changed API of uniseg's word_break 3 days ago
Mike Gerber b72d4f5af9
Merge pull request #131 from qurator-spk/chore/update-pre-commit
⚙  pre-commit: update
3 days ago
Mike Gerber 058042accb ⚙ pre-commit: update 3 days ago
Mike Gerber 071e6a8bd1
Merge pull request #120 from joschrew/dockerfile
Add Dockerfile and Makefile to create ocr-d dockerimage
6 months ago
Mike Gerber 6b82293670
Update Dockerfile
I fancy-clicked @bertsky's change suggestion, which duplicated some labels. Now fancy-clicking the fix, fingers crossed...
6 months ago
Mike Gerber 6ecf49a355
Update Dockerfile
Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
6 months ago
joschrew 9c7c104dce Add Dockerfile and Makefile to create ocr-d image 7 months ago
Mike Gerber 2e6fe0c279
Merge pull request #113 from qurator-spk/python-3.13
✔ Test on Python 3.13
8 months ago
Mike Gerber 1753ed4d13 ✔ Test on Python 3.13 8 months ago
Mike Gerber 3233dbcc8f ✔ pre-commit: Add license check 9 months ago
Mike Gerber f2e290dffe 🐛 Fix --version option in OCR-D CLI 9 months ago
Mike Gerber 6d1daf1dfe Support --version option in CLI 9 months ago
Mike Gerber 27ad145c7e ⚙ pyproject.toml: Add license.file 9 months ago
Mike Gerber 2e9e88cc1e ⚙ pre-commit: Update hooks 9 months ago
Mike Gerber 129e6eb427 📦 v0.9.7 9 months ago
Mike Gerber cf998443c1 ⚙ ruff: Update settings (select → lint.select) 9 months ago
Mike Gerber 6048107889 Merge branch 'master' of https://github.com/qurator-spk/dinglehopper 9 months ago
Mike Gerber 2ee37ed4e3 🎨 Sort imports 9 months ago
Mike Gerber 521f034fba
Merge pull request #116 from stweil/master
Fix typo
9 months ago
Mike Gerber d1a2247615 ⚙ pre-commit: Update hooks 9 months ago
Mike Gerber 4047f8b6e5 🐛 Fix loading ocrd-tool.json for Python 3.12 9 months ago
Stefan Weil cd68a973cb Fix typo
Signed-off-by: Stefan Weil <sw@weilnetz.de>
11 months ago
Mike Gerber bc5818da9f ✔ GitHub Actions: Update used actions 11 months ago
Mike Gerber c91234daba ✔ GitHub Actions: Update used actions 11 months ago
Mike Gerber a534b5e28e ⚙ pre-commit: Update hooks 11 months ago
Mike Gerber b336f98271 🐛 Fix reading plain text files
As reported by @tallemeersch in gh-107, newlines were not removed for plain text files.
Fix this by stripping the lines as suggested.

Fixes gh-107.
12 months ago
Mike Gerber 41a0fad352 📦 v0.9.6 12 months ago
Mike Gerber e72d1e37ea Revert "✔ Test on Python 3.13"
This reverts commit 0d5c6d5a62.
12 months ago
Mike Gerber 86e723cd53 🐛 GHA: Install possible shapely build requirements (if building from source) 12 months ago
Mike Gerber dc4565fd2d
Merge pull request #111 from stweil/typos
Fix some typos (found by `codespell` and `typos`)
12 months ago
Mike Gerber fbcb9160fd 🐛 GHA: Install possible lxml build requirements (if building from source) 12 months ago
Mike Gerber 0d5c6d5a62 ✔ Test on Python 3.13 12 months ago
Mike Gerber e34adbf41c 🐛 Fix Python 3.12 support by requiring ocrd >= 2.65.0 12 months ago
Mike Gerber 58a688b175 ⚙ pre-commit: Update hooks 12 months ago
Stefan Weil 79701e410d Fix some typos (found by `codespell` and `typos`)
Signed-off-by: Stefan Weil <sw@weilnetz.de>
12 months ago
Mike Gerber 2383730a55 ✔ Test using empty files
Test edge cases + empty files, e.g. empty text content and a Unicode BOM character.

See also gh-79.
1 year ago
Mike Gerber 98d7928f45 ⚙ pre-commit: Update hooks 1 year ago
Mike Gerber edabffec7e 🧹 tests: Move comment out of the code (bad style + weird formatting) 1 year ago
Mike Gerber 32d4037533 ⚙ cli: Annotate types in process_dir() 1 year ago
Mike Gerber fe1a713d55 ⚙ pre-commit: Update hooks 1 year ago
Mike Gerber be7c1dd25d 🧹 Make from_text_segment()'s textequiv_level keyword-only 1 year ago
Mike Gerber 932bfafc7d 🧹 Make process_dir() keyword arguments keyword-only 1 year ago
Mike Gerber 945aec5673 ✒ README-DEV: Releasing a new version 1 year ago
Mike Gerber c29a80bc81 📦 v0.9.5 1 year ago
Mike Gerber a1c1d0ad49 ⚙ pre-commit: Add mypy dependencies
Closes gh-106.
1 year ago
Mike Gerber 5d9f0c482f 🐛 Check that we always get a valid ALTO namespace (satisfies mypy) 1 year ago
Mike Gerber 19d1a00817 🎨 Reformat (Black) 1 year ago
Mike Gerber 4dc6b7dc04 ⚙ pre-commit: Update hooks 1 year ago
Mike Gerber 6b3697c864 Merge branch 'master' of https://github.com/qurator-spk/dinglehopper 1 year ago
Mike Gerber 4d4ead4cc8 🐛 Fix word segmentation with uniseg 0.8.0 1 year ago
Mike Gerber 0e3d24cac1
🐛 README.md: Fix badge (for real) 1 year ago
Mike Gerber 4016c01638
🐛 README.md: Fix test badge 1 year ago
Mike Gerber 4b64398cec 🚧 GitLab CI Test: Depend on child pipeline 1 year ago
Mike Gerber 7e033b6f03 🚧 GitLab CI Test: Depend on child pipeline 1 year ago
Mike Gerber 250ee2b7f2 🚧 GitLab CI Test: Push after pulling 1 year ago
Mike Gerber 76c4533aa5 🚧 GitLab CI Test: Push after pulling 1 year ago
Mike Gerber f8e31089b3 🚧 GitLab CI Test: Push after pulling 1 year ago
Mike Gerber 6cfb49fe39 🚧 GitLab CI Test: Push after pulling 1 year ago
Mike Gerber 5eba65f097 🚧 GitLab CI Test: Trigger only on default branch (and do not hardcode it) 1 year ago
Mike Gerber 83cef3106f 🚧 GitLab CI Test 1 year ago
Mike Gerber a95a85a889 🚧 GitLab CI Test 1 year ago
Mike Gerber ff34c65c1e 🔍 ruff: Remove ignore configuration, we use multimethods in a compatible way now 1 year ago
Mike Gerber 21c44d426e ⚙ pre-commit: Update hooks 1 year ago
Mike Gerber 10ccba989e 🚧 GitLab CI Test 1 year ago
Mike Gerber 10d423f045 🚧 GitLab CI Test 1 year ago
Mike Gerber 6d947a9ca9 🚧 GitLab CI Test 1 year ago
Mike Gerber 484da90d27 🚧 GitLab CI Test 1 year ago
Mike Gerber d0ddfa68a1 🚧 GitLab CI Test 1 year ago
Mike Gerber 81391132f0 🚧 GitLab CI Test 1 year ago
Mike Gerber dc390cd3f8 🚧 GitLab CI Test 1 year ago
Mike Gerber c77e8f51ab 🚧 GitLab CI Test 1 year ago
Mike Gerber e083688c66 🚧 GitLab CI Test 1 year ago
Mike Gerber 6d8afc27b3 🚧 GitLab CI Test 1 year ago
Mike Gerber af83b35f23 🚧 GitLab CI Test 1 year ago
Mike Gerber 344f96dca9 🚧 GitLab CI Test 1 year ago
Mike Gerber 483e809691 🔍 mypy: Use an almost strict mypy configuration, and fix any issues 1 year ago
Mike Gerber ad316aeabc 🔍 mypy: Use a compatible syntax for multimethod 1 year ago
Mike Gerber 8166435958 🔍 mypy: Remove ExtractedText.segments converter 1 year ago
Mike Gerber 24c25b6fcd 🔍 mypy: Avoid using check() for all attr validators 1 year ago
Mike Gerber ac9d360dcd 🔍 mypy: Make cli.process() typed so mypy checks it (and issues no warning) 1 year ago
Mike Gerber 788868b2ac Merge branch 'pr103' 1 year ago
Mike Gerber 59a3882ce5 🧹 GitHub Actions: Clean up whitespace 1 year ago
Sadra Barikbin 4466422cda Fix a typo 1 year ago
Sadra Barikbin 967f833eac Improve report 1 year ago
Sadra Barikbin f4ff6a8f31 Change reporter 1 year ago
Sadra Barikbin 4413ddac8f Temporary commit 1 year ago
Sadra Barikbin 6884c5c825 Update dorny dependency 1 year ago
Sadra Barikbin c90a61c12c Fix a few typos 1 year ago
Sadra Barikbin bf47308c00 Add report_tests workflow 1 year ago
Mike Gerber 4bf123de43 ⚙ Update ruff+mypy dependencies 1 year ago
Mike Gerber b36727ed9e ⚙ pre-commit: Update hooks 1 year ago
Mike Gerber 7a192880f1 ⬆ Move on to supporting Python >= 3.8 only 1 year ago
Mike Gerber c752793be6 🐛 Use typing.List instead of list, for Python <3.9 1 year ago
Mike Gerber 071766efc2 🐛 Use Optional instead of | None, for Python <3.10 1 year ago
Mike Gerber 4832d1542f ⚙ pre-commit: Update hooks 1 year ago
Mike Gerber c1681551af 🐛 Fix generating word differences 1 year ago
Mike Gerber 44bd4b5eda ⚙ pre-commit: Update hooks 1 year ago
Mike Gerber 296a820990 Merge branch 'master' of https://github.com/qurator-spk/dinglehopper 1 year ago
Mike Gerber 38fcbc8e1c Merge branch 'master' into performance 1 year ago
Mike Gerber d3fb3f96cf
Merge pull request #101 from sadra-barikbin/patch-1
Fix a tiny typo in Levenshtein notebook
1 year ago
Sadra Barikbin b0e906ad00
Update Levenshtein.ipynb
Fix a tiny typo in Levenshtein notebook.
1 year ago
Mike Gerber 68a12f8f7f ⬆ Update uniseg dependency
@maxbachmann also improved the performance of uniseg, and it is in 0.7.2 - update our
dependency.
1 year ago
Mike Gerber de6cd8f1e7 Make joining grapheme clusters more robust by checking joiner and handling an empty joiner 1 year ago
Mike Gerber 7c6ee593f0 🐛 Fix score_hint call in cli_line_dirs 1 year ago
Mike Gerber 618ea567de 🐛 Fix docstring of distance() for grapheme clusters 1 year ago
Mike Gerber e256526ea1 🐛 Fix calculation of score_hint for edge cases, e.g. when CER is infinite
If the CER is infinite, we can't calculate a score_hint as an int. Fall back to None
in this case.
1 year ago
Mike Gerber bc95c03127 🕸Do not use deprecated ID, pageId options
See gh-75.
2 years ago
Mike Gerber 7fef02bf0a ✔ Add mets:FLocat's @LOCTYPE/OTHERLOCTYPE to test data
Newest OCR-D wasn't happy with the test data anymore (see gh-89). I'm not sure if the
test data was invalid the way it was, but having a LOCTYPE certainly is "prettier" so
adding it. This fixes the test again.
2 years ago
Mike Gerber 7ed076d3c1 ⬆ Update multimethod dependency
We had some issues while reviewing/rebasing #72. We don't support Python 3.5 anymore,
so lifting the hard pin on multimethod 1.3.
2 years ago
Mike Gerber f077ce2e1b 🐛 dinglehopper-summarize: Handle reports without difference stats 2 years ago
Mike Gerber 39dc4186d6
Merge pull request #97 from qurator-spk/clean-remove-six-dep-again
🧹 Remove old six dependency (workaround for OCR-D/core#730)
2 years ago
Mike Gerber d776368484
Merge pull request #96 from qurator-spk/test-on-pr-but-really
🐛 (Hopefully) Fix running tests on PR
2 years ago
Mike Gerber 3f8c8e69aa 🐛 (Hopefully) Fix running tests on PR 2 years ago
Mike Gerber d8f84ec9ac 🧹 Remove old six dependency (workaround for OCR-D/core#730) 2 years ago
Mike Gerber df1d4d09f3
Merge pull request #94 from qurator-spk/test-on-pr
✔ GitHub Actions: Test on PR
2 years ago
Mike Gerber e7e0703d9d ✔ GitHub Actions: Test on PR 2 years ago
Mike Gerber 22e7247ac4
Merge pull request #93 from qurator-spk/update-dep-multimethod
⬆ Update multimethod dependency
2 years ago
Mike Gerber 1c3b28d873 ⬆ Update multimethod dependency
We had some issues while reviewing/rebasing #72. We don't support Python 3.5 anymore,
so lifting the hard pin on multimethod 1.3.
2 years ago
Mike Gerber 05b5502c57
Merge pull request #92 from qurator-spk/update-pre-commit
Update pre commit
2 years ago
Mike Gerber fe60361e8d ✒ README-DEV: Make pre-commit section top-level (+ small whitespace fix) 2 years ago
Mike Gerber 8a1ea4ec93 🎨 Add newlines at end of files (ruff) 2 years ago
Mike Gerber 4e0d4dcf09 ⚙ pre-commit: Add pre-commit-update hook (to update hooks using pre-commit) 2 years ago
Mike Gerber 061ba16461 ⚙ pre-commit: Update hooks 2 years ago
Mike Gerber 0c727dca9d
Merge pull request #91 from qurator-spk/test-remove-circleci
✔ Remove CircleCI config
2 years ago
Mike Gerber 1b7c2a61a3 ✔ Remove CircleCI config 2 years ago
Mike Gerber 994a27d458
Merge pull request #90 from qurator-spk/test-on-python-3.12
✔ GitHub Actions: Test on Python 3.12
2 years ago
Mike Gerber 5450f193e4 ✔ GitHub Actions: Test on Python 3.12 2 years ago
Mike Gerber 9d862e418b ✔ Add mets:FLocat's @LOCTYPE/OTHERLOCTYPE to test data
Newest OCR-D wasn't happy with the test data anymore (see gh-89). I'm not sure if the
test data was invalid the way it was, but having a LOCTYPE certainly is "prettier" so
adding it. This fixes the test again.
2 years ago
Mike Gerber dbaccdd5e3 ✒ README: Minor whitespace cleanup 2 years ago
Mike Gerber 54a3121172 ✒ README: Recommend installing via pip and from PyPI 2 years ago
Mike Gerber a1a7f95ac6 📦 v0.9.4 2 years ago
Mike Gerber 1e7c46285b 🎨 editorconfig: *.json should have a final newline 2 years ago
Mike Gerber 9594b4c9d2 🧹 pyproject: Remove extra *.json 2 years ago
Mike Gerber de70b198ac 🧹 Remove empty setup.cfg 2 years ago
Mike Gerber 6c70afbbc5 📦 v0.9.3 2 years ago
Mike Gerber 12b1ea3ae7 🐛 Remove MANIFEST.in workaround, now that setuptools_ocrd is fixed 2 years ago
Mike Gerber 98a67c7b3b 📦 v0.9.2 2 years ago
Mike Gerber 668072e338 🧹 .gitignore dist/ 2 years ago
Mike Gerber 563642c93b 🐛 Workaround sdist not containing top-level ocrd-tool.json
See https://github.com/qurator-spk/setuptools_ocrd/issues/10 - The sdist does not
contain ocrd-tool.json, so that the wheel built from it does not get the proper version.
Needs to be fixed in setuptools_ocrd, then MANIFEST.in can be removed again.
2 years ago
Gerber, Mike a18b25b163 🐛 Update tests for ExtractedText
In PR gh-72, @maxbachmann introduced a new argument for ExtractedText(). Update the
corresponding tests.
2 years ago
Max Bachmann f48e305347
use uniseg again 3 years ago
Max Bachmann d2bbc8a6c7 update rapidfuzz version 3 years ago
Max Bachmann a1f0a5e2d3 replace uniseg with uniseg2 3 years ago
Max Bachmann 22c3817f45 apply black 3 years ago
Max Bachmann 01571f23b7 move grapheme clusters to ExtractedText 3 years ago
Max Bachmann f211d09f56 remove python2.7 futures 3 years ago
Max Bachmann 205a969c0e remove unused includes 3 years ago
Max Bachmann f3825cdeb6
only call `words_normalized` once 3 years ago

.circleci/config.yml
@ -1,20 +0,0 @@
version: 2.1
jobs:
black:
parameters:
python-version:
type: string
docker:
- image: cimg/python:<< parameters.python-version >>
steps:
- checkout
- run: pip3 install --upgrade pip
- run: pip3 install black
- run: black .
workflows:
black:
jobs:
- black:
python-version: "3.11"

.dockerignore
@ -0,0 +1,5 @@
src/dinglehopper/tests
dist
build
*.egg-info
.git

.editorconfig
@ -15,7 +15,7 @@ indent_size = 2
[*.json]
indent_size = 2
insert_final_newline = false
insert_final_newline = true
# trailing spaces in markdown indicate word wrap
[*.md]

.github/workflows/release.yml
@ -17,7 +17,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Upgrade pip
run: python3 -m pip install --upgrade pip
- name: Install setuptools
@ -32,7 +32,7 @@ jobs:
- name: Build package
run: python3 -m pip install --upgrade build && python3 -m build
- name: Upload dist
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: dist
path: dist/
@ -42,7 +42,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Download dist
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: dist
path: dist/
@ -61,7 +61,7 @@ jobs:
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
steps:
- name: Download dist
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: dist
path: dist/

.github/workflows/test.yml
@ -1,4 +1,4 @@
name: test
name: Test
on:
@ -6,6 +6,10 @@ on:
branches:
- master
pull_request:
branches:
- master
schedule:
- cron: "00 16 07 * *" # = monthly
@ -21,30 +25,27 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
# For Python 3.6, we need to fall back to Ubuntu 20.04
runs-on: ${{ matrix.python-version == '3.6' && 'ubuntu-20.04' || 'ubuntu-latest' }}
env:
test_results_dir: test-results-${{ matrix.python-version }}
runs-on: "ubuntu-latest"
steps:
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
allow-prereleases: true
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Install possible lxml build requirements (if building from source)
run: sudo apt-get install -y libxml2-dev libxslt-dev python3-dev
- name: Install possible shapely build requirements (if building from source)
run: sudo apt-get install -y libgeos-dev
- name: Update pip
run: python3 -m pip install -U pip
- name: Avoid compiling OpenCV and NumPy on Python 3.6
run: |
if python3 --version | grep -q "Python 3.6"; then
pip install --prefer-binary -U opencv-python-headless numpy
fi
- name: Install requirements*.txt
run: |
for requirements_txt in requirements*.txt; do
@ -54,19 +55,10 @@ jobs:
- name: Test
run: |
cd src
mkdir -p ../$test_results_dir
python3 -m pytest --junitxml=../$test_results_dir/junit.xml -o junit_family=legacy
python3 -m pytest --junitxml=../${{matrix.python-version}}-junit.xml -o junit_family=legacy
- name: Upload test results
uses: actions/upload-artifact@v3
if: success() || failure()
with:
name: ${{ env.test_results_dir }}
path: ${{ env.test_results_dir }}
- name: Report tests
uses: dorny/test-reporter@v1
uses: actions/upload-artifact@v4
if: success() || failure()
with:
name: Results on Python ${{ matrix.python-version }}
path: "${{env.test_results_dir }}/junit.xml"
reporter: java-junit
name: test-results-${{matrix.python-version}}
path: ${{matrix.python-version}}-junit.xml

.github/workflows/report_tests.yml
@ -0,0 +1,20 @@
name: 'Test Report'
on:
workflow_run:
workflows: ['test']
types:
- completed
permissions:
contents: read
actions: read
checks: write
jobs:
report:
runs-on: ubuntu-latest
steps:
- uses: dorny/test-reporter@v1
with:
artifact: /test-results-(.*)/
name: 'Tests Results - $1'
path: '*junit.xml'
reporter: java-junit

.gitignore

@ -28,3 +28,4 @@ dmypy.json
# Build artifacts
/build
/dist

.gitlab-ci.yml
@ -0,0 +1,16 @@
variables:
http_proxy: "http://http-proxy.sbb.spk-berlin.de:3128/"
https_proxy: "http://http-proxy.sbb.spk-berlin.de:3128/"
HTTP_PROXY: "http://http-proxy.sbb.spk-berlin.de:3128/"
HTTPS_PROXY: "http://http-proxy.sbb.spk-berlin.de:3128/"
stages:
- triggers
mirror:
stage: triggers
trigger:
include: .gitlab/mirror.yml
strategy: depend
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH

.gitlab/mirror.yml
@ -0,0 +1,47 @@
stages:
- check
- pull
- push
default:
image: debian
check:
stage: check
script:
- whoami; env
- if [ -z "$CI_COMMIT_BRANCH" ]; then echo "Not on a branch" >&2; exit 3; fi
pull-gitlab:
stage: pull
script:
- echo "This is redundant"
pull-github:
stage: pull
before_script:
- apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
script:
- git remote remove github 2>/dev/null || true
- git remote add github https://github.com/qurator-spk/dinglehopper.git
- git remote -v
- git pull github "$CI_COMMIT_BRANCH"
push-gitlab:
stage: push
before_script:
- apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
script:
- git push origin "$CI_COMMIT_SHA":"$CI_COMMIT_BRANCH"
push-github:
stage: push
before_script:
- apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
script:
- git push github "$CI_COMMIT_SHA":"$CI_COMMIT_BRANCH"

.pre-commit-config.yaml
@ -1,8 +1,6 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.2.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
@ -13,17 +11,37 @@ repos:
- id: check-ast
- repo: https://github.com/psf/black
rev: 22.10.0
rev: 25.1.0
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.280
rev: v0.11.5
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
- args:
- --fix
- --exit-non-zero-on-fix
id: ruff
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.4.1
rev: v1.15.0
hooks:
- id: mypy
- additional_dependencies:
- types-setuptools
- types-lxml
- numpy # for numpy plugin
- attrs
- multimethod
- rapidfuzz
id: mypy
- repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
rev: v0.6.1
hooks:
- id: pre-commit-update
- repo: https://github.com/dhatim/python-license-check
rev: 0.9.2
hooks:
- id: liccheck
language: system

Dockerfile
@ -0,0 +1,38 @@
ARG DOCKER_BASE_IMAGE
FROM $DOCKER_BASE_IMAGE
ARG VCS_REF
ARG BUILD_DATE
LABEL \
maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
org.label-schema.vcs-ref=$VCS_REF \
org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
org.label-schema.build-date=$BUILD_DATE \
org.opencontainers.image.vendor="qurator" \
org.opencontainers.image.title="dinglehopper" \
org.opencontainers.image.description="An OCR evaluation tool" \
org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
org.opencontainers.image.revision=$VCS_REF \
org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.base.name=ocrd/core
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8
# avoid HOME/.local/share (hard to predict USER here)
# so let XDG_DATA_HOME coincide with fixed system location
# (can still be overridden by derived stages)
ENV XDG_DATA_HOME /usr/local/share
# avoid the need for an extra volume for persistent resource user db
# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
WORKDIR /build/dinglehopper
COPY . .
COPY ocrd-tool.json .
# prepackage ocrd-tool.json as ocrd-all-tool.json
RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
RUN make install && rm -rf /build/dinglehopper
WORKDIR /data
VOLUME /data

Makefile
@ -0,0 +1,33 @@
PYTHON = python3
PIP = pip3
PYTHONIOENCODING=utf8
PYTEST_ARGS = -vv
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
DOCKER_TAG = ocrd/dinglehopper
help:
@echo
@echo " Targets"
@echo
@echo " install Install full Python package via pip"
@echo " docker Build the ocrd/dinglehopper docker image"
# Install Python package via pip
install:
$(PIP) install .
install-dev:
$(PIP) install -e .
test:
pytest $(PYTEST_ARGS)
docker:
docker build \
--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
-t $(DOCKER_TAG) .
.PHONY: help install install-dev test docker

README-DEV.md
@ -10,6 +10,7 @@ pytest
```
## Test running examples
Only unit tests:
```bash
pytest -m "not integration"
@ -36,9 +37,21 @@ pytest -k "not test" --mypy
pytest -k "not test" --ruff
```
## How to use pre-commit
# How to use pre-commit
This project optionally uses [pre-commit](https://pre-commit.com) to check commits. To use it:
- Install pre-commit, e.g. `pip install -r requirements-dev.txt`
- Install the repo-local git hooks: `pre-commit install`
# Releasing a new version
- Update `ocrd-tool.json`
- `git commit`
- `git tag vx.y.z`
- `git push && git push --tags`
- The GitHub Actions workflow `release` will now create
a. a new release on GitHub and
b. a new release on PyPI
- Currently requires a review for PYPI?

README.md
@ -8,7 +8,7 @@ compares a ground truth (GT) document page with a OCR result page to compute
metrics and a word/character differences report. It also supports batch processing by
generating, aggregating and summarizing multiple reports.
[![Tests](https://github.com/qurator-spk/dinglehopper/workflows/test/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
[![Tests](https://github.com/qurator-spk/dinglehopper/actions/workflows/test.yml/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
[![GitHub tag](https://img.shields.io/github/tag/qurator-spk/dinglehopper?include_prereleases=&sort=semver&color=blue)](https://github.com/qurator-spk/dinglehopper/releases/)
[![License](https://img.shields.io/badge/License-Apache-blue)](#license)
[![issues - dinglehopper](https://img.shields.io/github/issues/qurator-spk/dinglehopper)](https://github.com/qurator-spk/dinglehopper/issues)
@ -23,10 +23,11 @@ Goals
Installation
------------
It's best to use pip, e.g.:
~~~
sudo pip install .
~~~
It's best to use pip to install the package from PyPI, e.g.:
```
pip install dinglehopper
```
Usage
-----
@ -99,11 +100,11 @@ This generates `summary.html` and `summary.json` in the same `output_folder`.
If you are summarizing many reports and have used the `--differences` flag while
generating them, it may be useful to limit the number of differences reported by using
the `--occurences-threshold` parameter. This will reduce the size of the generated HTML
the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
report, making it easier to open and navigate. Note that the JSON report will still
contain all differences. Example:
~~~
dinglehopper-summarize output_folder/ --occurences-threshold 10
dinglehopper-summarize output_folder/ --occurrences-threshold 10
~~~
### dinglehopper-line-dirs

pyproject.toml
@ -7,9 +7,10 @@ authors = [
{name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
{name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
]
description = "The OCR evaluation tool"
description = "An OCR evaluation tool"
readme = "README.md"
requires-python = ">=3.6"
license.file = "LICENSE"
requires-python = ">=3.9"
keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
dynamic = ["version", "dependencies", "optional-dependencies"]
@ -48,7 +49,7 @@ optional-dependencies.dev = {file = ["requirements-dev.txt"]}
where = ["src"]
[tool.setuptools.package-data]
dinglehopper = ["*.json", "templates/*"]
dinglehopper = ["templates/*", "*.json"]
[tool.pytest.ini_options]
@ -60,11 +61,54 @@ markers = [
[tool.mypy]
plugins = ["numpy.typing.mypy_plugin"]
ignore_missing_imports = true
[tool.ruff]
strict = true
disallow_subclassing_any = false
# ❗ error: Class cannot subclass "Processor" (has type "Any")
disallow_any_generics = false
disallow_untyped_defs = false
disallow_untyped_calls = false
[tool.ruff.lint]
select = ["E", "F", "I"]
ignore = [
"F811", # multimethods are considered redefinitions by ruff
[tool.liccheck]
authorized_licenses = [
"bsd",
"new bsd",
"bsd license",
"new bsd license",
"simplified bsd",
"apache",
"apache 2.0",
"apache software license",
"apache software",
"apache license 2.0",
"gnu lgpl",
"lgpl with exceptions or zpl",
"GNU Library or Lesser General Public License (LGPL)",
"GNU Lesser General Public License v3 (LGPLv3)",
"GNU Lesser General Public License v2 or later (LGPLv2+)",
"mit",
"mit license",
"mit-cmu",
"python software foundation",
"psf",
"psf-2.0",
"Historical Permission Notice and Disclaimer (HPND)",
"public domain",
'The Unlicense (Unlicense)',
"isc",
"ISC License (ISCL)",
'Mozilla Public License 2.0 (MPL 2.0)',
]
unauthorized_licenses = [
"gpl v3",
]

requirements-dev.txt
@ -1,8 +1,14 @@
pytest
pytest-cov
pytest-mypy
black
pre-commit
ruff ; python_version >= "3.7"
pytest-ruff ; python_version >= "3.7"
ruff
pytest-ruff
mypy
types-lxml
types-setuptools
pytest-mypy
liccheck

requirements.txt
@ -1,14 +1,14 @@
click
jinja2
lxml
uniseg
uniseg >= 0.9.1
numpy
colorama
MarkupSafe
ocrd >= 2.20.1
ocrd >= 3.3.0
attrs
multimethod == 1.3 # latest version to officially support Python 3.5
multimethod >= 1.3
tqdm
rapidfuzz >= 2.4.2
six # XXX workaround OCR-D/core#730
rapidfuzz >= 2.7.0
chardet
importlib_resources

src/dinglehopper/__init__.py
@ -1,4 +1,4 @@
from .align import align, seq_align
from .align import align, score_hint, seq_align
from .character_error_rate import character_error_rate, character_error_rate_n
from .edit_distance import distance, editops
from .extracted_text import ExtractedText
@ -16,6 +16,7 @@ __all__ = [
"editops",
"distance",
"align",
"score_hint",
"seq_align",
"character_error_rate",
"character_error_rate_n",

src/dinglehopper/align.py
@ -1,8 +1,10 @@
import math
import unicodedata
from math import ceil
from typing import Optional
from rapidfuzz.distance import Levenshtein
from .edit_distance import grapheme_clusters
from uniseg.graphemecluster import grapheme_clusters
def align(t1, t2):
@ -12,11 +14,27 @@ def align(t1, t2):
return seq_align(s1, s2)
def seq_align(s1, s2):
def score_hint(er: float, n: int) -> Optional[int]:
"""Calculate RapidFuzz score hint for a given error rate and count.
Gives the score hint for the distance functions (= expected distance) or None if
the error rate is inf.
"""
assert not math.isnan(er)
try:
score_hint = int(ceil(er * n))
except (OverflowError, ValueError):
# ceil(er * n) can be inf or NaN (for n == 0), so int() can throw an
# OverflowError and a ValueError.
score_hint = None
return score_hint
def seq_align(s1, s2, score_hint=None):
"""Align general sequences."""
s1 = list(s1)
s2 = list(s2)
ops = Levenshtein.editops(s1, s2)
ops = Levenshtein.editops(s1, s2, score_hint=score_hint)
i = 0
j = 0
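
The `score_hint` helper added above turns an expected error rate into an upper bound on the number of edit operations, which lets RapidFuzz cut the search off early. A minimal usage sketch, mirroring the `test_score_hint` test further below (`score_hint` is exported from the package per the `__init__.py` change above):

```python
import math

from dinglehopper import score_hint

# A CER of 0.5 over 23 grapheme clusters means we expect at most
# ceil(0.5 * 23) = 12 edit operations.
assert score_hint(0.5, 23) == 12
# An infinite error rate gives no usable bound, hence None.
assert score_hint(math.inf, 12345) is None
```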

src/dinglehopper/character_error_rate.py
@ -1,7 +1,5 @@
from __future__ import division
import unicodedata
from typing import Tuple
from typing import List, Tuple, TypeVar
from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters
@ -9,9 +7,13 @@ from uniseg.graphemecluster import grapheme_clusters
from .edit_distance import distance
from .extracted_text import ExtractedText
T = TypeVar("T")
@multimethod
def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
def character_error_rate_n(
reference: List[str], compared: List[str]
) -> Tuple[float, int]:
"""
Compute character error rate.
@ -19,7 +21,7 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
"""
d = distance(reference, compared)
n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))
n = len(reference)
if d == 0:
return 0, n
@ -30,18 +32,28 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
# XXX Should we really count newlines here?
@multimethod
def character_error_rate_n(
reference: ExtractedText, compared: ExtractedText
) -> Tuple[float, int]:
return character_error_rate_n(reference.text, compared.text)
@character_error_rate_n.register
def _(reference: str, compared: str) -> Tuple[float, int]:
seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", reference)))
seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", compared)))
cer, n = character_error_rate_n(seq1, seq2)
return cer, n
@character_error_rate_n.register
def _(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
cer, n = character_error_rate_n(
reference.grapheme_clusters, compared.grapheme_clusters
)
return cer, n
def character_error_rate(reference, compared) -> float:
def character_error_rate(reference: T, compared: T) -> float:
"""
Compute character error rate.
:return: character error rate
"""
cer: float
cer, _ = character_error_rate_n(reference, compared)
return cer
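
A small sketch of the refactored dispatch, using the package export: plain strings are NFC-normalized and split into grapheme clusters first, while `ExtractedText` arguments reuse their precomputed clusters. The values follow from the code above.

```python
from dinglehopper import character_error_rate_n

# "fo0" differs from "foo" by one substitution over n = 3 grapheme clusters.
cer, n = character_error_rate_n("foo", "fo0")
assert (cer, n) == (1 / 3, 3)
```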

src/dinglehopper/cli.py
@ -1,13 +1,13 @@
import os
from collections import Counter
from typing import List
import click
from jinja2 import Environment, FileSystemLoader
from markupsafe import escape
from ocrd_utils import initLogging
from uniseg.graphemecluster import grapheme_clusters
from dinglehopper.align import seq_align
from dinglehopper.align import score_hint, seq_align
from dinglehopper.character_error_rate import character_error_rate_n
from dinglehopper.config import Config
from dinglehopper.extracted_text import ExtractedText
@ -15,7 +15,9 @@ from dinglehopper.ocr_files import extract
from dinglehopper.word_error_rate import word_error_rate_n, words_normalized
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
def gen_diff_report(
gt_in, ocr_in, css_prefix, joiner, none, *, differences=False, score_hint=None
):
gtx = ""
ocrx = ""
@ -42,9 +44,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
if isinstance(gt_in, ExtractedText):
if not isinstance(ocr_in, ExtractedText):
raise TypeError()
# XXX splitting should be done in ExtractedText
gt_things = list(grapheme_clusters(gt_in.text))
ocr_things = list(grapheme_clusters(ocr_in.text))
gt_things = gt_in.grapheme_clusters
ocr_things = ocr_in.grapheme_clusters
else:
gt_things = gt_in
ocr_things = ocr_in
@ -53,7 +54,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
o_pos = 0
found_differences = []
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, score_hint)):
css_classes = None
gt_id = None
ocr_id = None
@ -76,7 +77,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
if o is not None:
o_pos += len(o)
found_differences = dict(Counter(elem for elem in found_differences))
counted_differences = dict(Counter(elem for elem in found_differences))
return (
"""
@ -87,7 +88,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
""".format(
gtx, ocrx
),
found_differences,
counted_differences,
)
@ -105,15 +106,15 @@ def json_float(value):
def process(
gt,
ocr,
report_prefix,
reports_folder=".",
gt: str,
ocr: str,
report_prefix: str,
reports_folder: str = ".",
*,
metrics=True,
differences=False,
textequiv_level="region",
):
metrics: bool = True,
differences: bool = False,
textequiv_level: str = "region",
) -> None:
"""Check OCR result against GT.
The @click decorators change the signature of the decorated functions, so we keep
@ -122,22 +123,34 @@ def process(
gt_text = extract(gt, textequiv_level=textequiv_level)
ocr_text = extract(ocr, textequiv_level=textequiv_level)
gt_words: List[str] = list(words_normalized(gt_text))
ocr_words: List[str] = list(words_normalized(ocr_text))
assert isinstance(gt_text, ExtractedText)
assert isinstance(ocr_text, ExtractedText)
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
wer, n_words = word_error_rate_n(gt_text, ocr_text)
char_diff_report, diff_c = gen_diff_report(
gt_text, ocr_text, css_prefix="c", joiner="", none="·", differences=differences
gt_text,
ocr_text,
css_prefix="c",
joiner="",
none="·",
score_hint=score_hint(cer, n_characters),
differences=differences,
)
gt_words = words_normalized(gt_text)
ocr_words = words_normalized(ocr_text)
# {gt,ocr}_words must not be a generator, so we don't drain it for the differences
# report.
assert isinstance(gt_words, list)
assert isinstance(ocr_words, list)
wer, n_words = word_error_rate_n(gt_words, ocr_words)
word_diff_report, diff_w = gen_diff_report(
gt_words,
ocr_words,
css_prefix="w",
joiner=" ",
none="",
score_hint=score_hint(wer, n_words),
differences=differences,
)
@ -174,8 +187,15 @@ def process(
def process_dir(
gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level
):
gt: str,
ocr: str,
report_prefix: str,
reports_folder: str = ".",
*,
metrics: bool = True,
differences: bool = False,
textequiv_level: str = "region",
) -> None:
for gt_file in os.listdir(gt):
gt_file_path = os.path.join(gt, gt_file)
ocr_file_path = os.path.join(ocr, gt_file)
@ -214,6 +234,7 @@ def process_dir(
metavar="LEVEL",
)
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
@click.version_option()
def main(
gt,
ocr,
@ -256,9 +277,9 @@ def main(
ocr,
report_prefix,
reports_folder,
metrics,
differences,
textequiv_level,
metrics=metrics,
differences=differences,
textequiv_level=textequiv_level,
)
else:
process(
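
After this refactor the report options are keyword-only; a call sketch with hypothetical file names (not taken from the repository):

```python
from dinglehopper.cli import process

process(
    "gt.page.xml",   # hypothetical ground-truth PAGE file
    "ocr.page.xml",  # hypothetical OCR result
    "report",        # report prefix
    "reports",       # reports folder
    metrics=True,
    differences=True,
    textequiv_level="line",
)
```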

src/dinglehopper/cli_line_dirs.py
@ -5,6 +5,7 @@ import click
from jinja2 import Environment, FileSystemLoader
from ocrd_utils import initLogging
from .align import score_hint
from .character_error_rate import character_error_rate_n
from .cli import gen_diff_report, json_float
from .ocr_files import plain_extract
@ -49,6 +50,8 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
ocr_text = plain_extract(
os.path.join(ocr_dir, ocr), include_filename_in_id=True
)
gt_words = words_normalized(gt_text)
ocr_words = words_normalized(ocr_text)
# Compute CER
l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
@ -62,7 +65,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
n_characters = n_characters + l_n_characters
# Compute WER
l_wer, l_n_words = word_error_rate_n(gt_text, ocr_text)
l_wer, l_n_words = word_error_rate_n(gt_words, ocr_words)
if wer is None:
wer, n_words = l_wer, l_n_words
else:
@ -72,12 +75,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
# Generate diff reports
char_diff_report += gen_diff_report(
gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·"
gt_text,
ocr_text,
css_prefix="l{0}-c".format(k),
joiner="",
none="·",
score_hint=score_hint(l_cer, l_n_characters),
)
gt_words = words_normalized(gt_text)
ocr_words = words_normalized(ocr_text)
word_diff_report += gen_diff_report(
gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none=""
gt_words,
ocr_words,
css_prefix="l{0}-w".format(k),
joiner=" ",
none="",
score_hint=score_hint(l_wer, l_n_words),
)
env = Environment(

src/dinglehopper/cli_summarize.py
@ -1,5 +1,6 @@
import json
import os
from typing import Dict
import click
from jinja2 import Environment, FileSystemLoader
@ -13,8 +14,8 @@ def process(reports_folder, occurrences_threshold=1):
wer_list = []
cer_sum = 0
wer_sum = 0
diff_c = {}
diff_w = {}
diff_c: Dict[str, int] = {}
diff_w: Dict[str, int] = {}
for report in os.listdir(reports_folder):
if report.endswith(".json"):
@ -34,10 +35,15 @@ def process(reports_folder, occurrences_threshold=1):
cer_sum += cer
wer_sum += wer
for key, value in report_data["differences"]["character_level"].items():
try:
for key, value in report_data["differences"][
"character_level"
].items():
diff_c[key] = diff_c.get(key, 0) + value
for key, value in report_data["differences"]["word_level"].items():
diff_w[key] = diff_w.get(key, 0) + value
except KeyError:
pass
if len(cer_list) == 0:
click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
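
The new `try`/`except KeyError` guards against reports generated without `--differences`, which lack the `differences` section entirely. A minimal sketch of that aggregation step, with a hypothetical report dict:

```python
from typing import Dict

report_data = {"cer": 0.02, "wer": 0.1}  # hypothetical report, no differences
diff_c: Dict[str, int] = {}
try:
    for key, value in report_data["differences"]["character_level"].items():
        diff_c[key] = diff_c.get(key, 0) + value
except KeyError:
    pass  # no difference stats in this report, skip it
assert diff_c == {}
```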

src/dinglehopper/edit_distance.py
@ -1,6 +1,5 @@
from __future__ import division, print_function
import unicodedata
from typing import List
from multimethod import multimethod
from rapidfuzz.distance import Levenshtein
@ -10,7 +9,18 @@ from .extracted_text import ExtractedText
@multimethod
def distance(s1: str, s2: str):
def distance(seq1: List[str], seq2: List[str]) -> int:
"""Compute the Levenshtein edit distance between two lists of grapheme clusters.
This assumes that the grapheme clusters are already normalized.
Use distance(str, str) instead if you need to compare two Unicode strings.
"""
return Levenshtein.distance(seq1, seq2)
@distance.register
def _(s1: str, s2: str) -> int:
"""Compute the Levenshtein edit distance between two Unicode strings
Note that this is different from levenshtein() as this function knows about Unicode
@ -22,9 +32,9 @@ def distance(s1: str, s2: str):
return Levenshtein.distance(seq1, seq2)
@multimethod
def distance(s1: ExtractedText, s2: ExtractedText):
return distance(s1.text, s2.text)
@distance.register
def _(s1: ExtractedText, s2: ExtractedText) -> int:
return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters)
def editops(word1, word2):
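
The string overload keeps its Unicode awareness: both inputs are NFC-normalized and split into grapheme clusters before comparison. A minimal sketch using the package export:

```python
import unicodedata

from dinglehopper import distance

word = "Schlyñ"
# The decomposed (NFD) spelling is normalized to NFC first, so the two
# representations of the same word have distance 0.
assert distance(word, unicodedata.normalize("NFD", word)) == 0
```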

src/dinglehopper/extracted_text.py
@ -1,14 +1,16 @@
import enum
import functools
import re
import unicodedata
from contextlib import suppress
from itertools import repeat
from typing import Optional
from typing import Any, Dict, List, Optional
import attr
import numpy as np
from lxml import etree as ET
from ocrd_utils import getLogger
from uniseg.graphemecluster import grapheme_clusters
class Normalization(enum.Enum):
@ -120,7 +122,7 @@ class ExtractedText:
segment_id = attr.ib(type=Optional[str])
@segment_id.validator
def check(self, _, value):
def is_valid_segment_id(self, _, value):
if value is None:
return
if not re.match(r"[\w\d_-]+", value):
@ -130,33 +132,85 @@ class ExtractedText:
# a. _text itself
# b. or segments (ExtractedText) and a joiner
segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list))
segments = attr.ib(type=Optional[List["ExtractedText"]])
joiner = attr.ib(type=Optional[str])
_text = attr.ib(type=Optional[str])
_grapheme_clusters = attr.ib(type=Optional[List[str]])
@segments.validator
def check(self, _, value):
def cant_set_both_segments_and_text(self, _, value):
if value is not None and self._text is not None:
raise ValueError("Can't have both segments and text")
@joiner.validator
def is_valid_joiner(self, _, value):
if self.segments is None:
if value is not None:
raise ValueError("Can't have joiner without segments to join")
if self.segments is not None:
if value not in ("", " ", "\n"):
raise ValueError(f"Unexpected segment joiner value {repr(value)}")
@_text.validator
def check(self, _, value):
if value is not None and self.segments is not None:
def is_valid_text(self, _, value):
if value is None:
return
if self.segments is not None:
raise ValueError("Can't have both segments and text")
if value is not None and unicodedata.normalize("NFC", value) != value:
if unicodedata.normalize("NFC", value) != value:
raise ValueError('String "{}" is not in NFC.'.format(value))
if value is not None and normalize(value, self.normalization) != value:
if normalize(value, self.normalization) != value:
raise ValueError('String "{}" is not normalized.'.format(value))
if self._grapheme_clusters is None:
raise ValueError("Requires both text and grapheme clusters to be set")
@_grapheme_clusters.validator
def are_valid_grapheme_clusters(self, _, value):
if value is not None and self._text is None:
raise ValueError("Requires both text and grapheme clusters to be set")
normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
@property
def text(self):
def text(self) -> str:
if self._text is not None:
return self._text
else:
assert self.joiner is not None and self.segments is not None
return self.joiner.join(s.text for s in self.segments)
@functools.cached_property
def _joiner_grapheme_cluster(self):
"""We need the joiner as a list of 0 or 1 grapheme clusters.
This property is cached.
"""
assert self.joiner is not None
if len(self.joiner) > 0:
joiner_grapheme_cluster = list(grapheme_clusters(self.joiner))
assert len(joiner_grapheme_cluster) == 1 # see joiner's check above
elif len(self.joiner) == 0:
joiner_grapheme_cluster = []
else:
joiner_grapheme_cluster = None
return joiner_grapheme_cluster
@property
def grapheme_clusters(self):
if self._text is not None:
return self._grapheme_clusters
else:
# TODO Test with text extracted at glyph level (joiner == "")
clusters = []
assert self.segments is not None
for seg in self.segments:
clusters += seg.grapheme_clusters + self._joiner_grapheme_cluster
clusters = clusters[:-1]
return clusters
_segment_id_for_pos = None
def segment_id_for_pos(self, pos):
@ -167,6 +221,7 @@ class ExtractedText:
else:
# Recurse
segment_id_for_pos = []
assert self.joiner is not None and self.segments is not None
for s in self.segments:
seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))]
segment_id_for_pos.extend(seg_ids)
@ -180,7 +235,7 @@ class ExtractedText:
return self._segment_id_for_pos[pos]
@classmethod
def from_text_segment(cls, text_segment, nsmap, textequiv_level="region"):
def from_text_segment(cls, text_segment, nsmap, *, textequiv_level="region"):
"""Build an ExtractedText from a PAGE content text element"""
localname_for_textequiv_level = {"region": "TextRegion", "line": "TextLine"}
@ -197,7 +252,8 @@ class ExtractedText:
# FIXME hardcoded SBB normalization
segment_text = normalize_sbb(segment_text)
segment_text = segment_text or ""
return cls(segment_id, None, None, segment_text)
clusters = list(grapheme_clusters(segment_text))
return cls(segment_id, None, None, segment_text, clusters)
else:
# Recurse
sub_localname = children_for_localname[localname]
@ -212,12 +268,15 @@ class ExtractedText:
)
)
joiner = joiner_for_textequiv_level[sub_textequiv_level]
return cls(segment_id, segments, joiner, None)
return cls(segment_id, segments, joiner, None, None)
@classmethod
def from_str(cls, text, normalization=Normalization.NFC_SBB):
normalized_text = normalize(text, normalization)
return cls(None, None, None, normalized_text, normalization=normalization)
clusters = list(grapheme_clusters(normalized_text))
return cls(
None, None, None, normalized_text, clusters, normalization=normalization
)
def invert_dict(d):
@ -225,7 +284,7 @@ def invert_dict(d):
return {v: k for k, v in d.items()}
def get_textequiv_unicode(text_segment, nsmap) -> str:
def get_textequiv_unicode(text_segment: Any, nsmap: Dict[str, str]) -> str:
"""Get the TextEquiv/Unicode text of the given PAGE text element."""
segment_id = text_segment.attrib["id"]
textequivs = text_segment.findall("./page:TextEquiv", namespaces=nsmap)
@ -249,7 +308,7 @@ def get_first_textequiv(textequivs, segment_id):
if np.any(~nan_mask):
if np.any(nan_mask):
log.warning("TextEquiv without index in %s.", segment_id)
index = np.nanargmin(indices)
index = int(np.nanargmin(indices))
else:
# try ordering by conf
confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
@ -258,7 +317,7 @@ def get_first_textequiv(textequivs, segment_id):
"No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
segment_id,
)
index = np.nanargmax(confidences)
index = int(np.nanargmax(confidences))
else:
# fallback to first entry in case of neither index or conf present
log.warning("No index attributes, use first TextEquiv in %s.", segment_id)
@ -266,11 +325,11 @@ def get_first_textequiv(textequivs, segment_id):
return textequivs[index]
def get_attr(te, attr_name) -> float:
def get_attr(te: Any, attr_name: str) -> float:
"""Extract the attribute for the given name.
Note: currently only handles numeric values!
Other or non existend values are encoded as np.nan.
Other or non existent values are encoded as np.nan.
"""
attr_value = te.attrib.get(attr_name)
try:
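
Constructing an `ExtractedText` now takes the grapheme clusters alongside the text; a minimal sketch mirroring the updated tests below:

```python
from uniseg.graphemecluster import grapheme_clusters

from dinglehopper.extracted_text import ExtractedText

# Leaf segments carry their text plus its grapheme clusters ...
seg0 = ExtractedText("s0", None, None, "foo", list(grapheme_clusters("foo")))
seg1 = ExtractedText("s1", None, None, "bar", list(grapheme_clusters("bar")))
# ... while containers pass None for both and join their child segments.
doc = ExtractedText(None, [seg0, seg1], " ", None, None)
assert doc.text == "foo bar"
assert doc.grapheme_clusters == ["f", "o", "o", " ", "b", "a", "r"]
```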

Levenshtein.ipynb
@ -22,7 +22,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"dinglehopper uses to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz."
"dinglehopper used to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz."
]
},
{
@ -391,7 +391,7 @@
"\\text{CER} = \\frac{i + s + d}{n}\n",
"$$\n",
"\n",
"where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropiate as they *are* clear about this when computing the word error rate.)"
"where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropriate as they *are* clear about this when computing the word error rate.)"
]
},
{
@ -680,7 +680,7 @@
" return cat in unwanted_categories or subcat in unwanted_subcategories\n",
"\n",
" # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n",
" # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctation \"or similar characters.\"\n",
" # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctuation \"or similar characters.\"\n",
" for word in uniseg.wordbreak.words(s):\n",
" if all(unwanted(c) for c in word):\n",
" pass\n",

src/dinglehopper/ocr_files.py
@ -1,44 +1,53 @@
from __future__ import division, print_function
import os
import sys
from typing import Iterator
from typing import Dict, Iterator, Optional
import chardet
from lxml import etree as ET
from lxml.etree import XMLSyntaxError
from uniseg.graphemecluster import grapheme_clusters
from .extracted_text import ExtractedText, normalize_sbb
def alto_namespace(tree: ET.ElementTree) -> str:
def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
"""Return the ALTO namespace used in the given ElementTree.
This relies on the assumption that, in any given ALTO file, the root element has the
local name "alto". We do not check if the files uses any valid ALTO namespace.
local name "alto". We do not check if the file uses any valid ALTO namespace.
"""
root_name = ET.QName(tree.getroot().tag)
if root_name.localname == "alto":
assert isinstance(root_name.namespace, str)
return root_name.namespace
else:
raise ValueError("Not an ALTO tree")
def alto_extract_lines(tree: ET.ElementTree) -> Iterator[ExtractedText]:
nsmap = {"alto": alto_namespace(tree)}
def alto_nsmap(tree: ET._ElementTree) -> Dict[str, str]:
alto_ns = alto_namespace(tree)
if alto_ns is None:
raise ValueError("Could not determine ALTO namespace")
return {"alto": alto_ns}
def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]:
nsmap = alto_nsmap(tree)
for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
line_id = line.attrib.get("ID")
line_text = " ".join(
string.attrib.get("CONTENT")
string.attrib.get("CONTENT", "")
for string in line.iterfind("alto:String", namespaces=nsmap)
)
yield ExtractedText(line_id, None, None, normalize_sbb(line_text))
normalized_text = normalize_sbb(line_text)
clusters = list(grapheme_clusters(normalized_text))
yield ExtractedText(line_id, None, None, normalized_text, clusters)
# FIXME hardcoded SBB normalization
def alto_extract(tree: ET.ElementTree) -> ExtractedText:
def alto_extract(tree: ET._ElementTree) -> ExtractedText:
"""Extract text from the given ALTO ElementTree."""
return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None)
return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None, None)
def alto_text(tree):
@ -87,7 +96,7 @@ def page_extract(tree, *, textequiv_level="region"):
# Filter empty region texts
regions = [r for r in regions if r.text != ""]
return ExtractedText(None, regions, "\n", None)
return ExtractedText(None, regions, "\n", None, None)
def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
@ -97,7 +106,7 @@ def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
if ET.QName(group.tag).localname in ["OrderedGroup", "OrderedGroupIndexed"]:
ro_children = list(group)
ro_children = filter(lambda child: "index" in child.attrib.keys(), ro_children)
ro_children = [child for child in ro_children if "index" in child.attrib.keys()]
ro_children = sorted(ro_children, key=lambda child: int(child.attrib["index"]))
elif ET.QName(group.tag).localname in ["UnorderedGroup", "UnorderedGroupIndexed"]:
ro_children = list(group)
@ -143,21 +152,25 @@ def detect_encoding(filename):
def plain_extract(filename, include_filename_in_id=False):
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
fileencoding = detect_encoding(filename)
with open(filename, "r", encoding=fileencoding) as f:
def make_segment(no, line):
normalized_text = normalize_sbb(line)
clusters = list(grapheme_clusters(normalized_text))
return ExtractedText(
None,
[
ExtractedText(
id_template.format(filename=os.path.basename(filename), no=no),
None,
None,
normalize_sbb(line),
normalized_text,
clusters,
)
for no, line in enumerate(f.readlines())
],
fileencoding = detect_encoding(filename)
with open(filename, "r", encoding=fileencoding) as f:
return ExtractedText(
None,
[make_segment(no, line.strip()) for no, line in enumerate(f.readlines())],
"\n",
None,
None,
)
# XXX hardcoded SBB normalization
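
`plain_extract` now strips each line (the gh-107 fix) and yields one leaf segment per line. A usage sketch with a hypothetical file name:

```python
from dinglehopper.ocr_files import plain_extract

# "gt.txt" is a hypothetical plain-text file, one text line per file line.
extracted = plain_extract("gt.txt", include_filename_in_id=True)
print(extracted.text)  # all lines joined with "\n"
for segment in extracted.segments:
    print(segment.segment_id, segment.text)  # e.g. 'gt.txt - line 0', ...
```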

src/dinglehopper/ocrd-tool.json
@ -1,17 +1,13 @@
{
"version": "0.9.1",
"version": "0.10.0",
"git_url": "https://github.com/qurator-spk/dinglehopper",
"dockerhub": "ocrd/dinglehopper",
"tools": {
"ocrd-dinglehopper": {
"executable": "ocrd-dinglehopper",
"input_file_grp_cardinality": 2,
"output_file_grp_cardinality": 1,
"description": "Evaluate OCR text against ground truth with dinglehopper",
"input_file_grp": [
"OCR-D-GT-PAGE",
"OCR-D-OCR"
],
"output_file_grp": [
"OCR-D-OCR-EVAL"
],
"categories": [
"Quality assurance"
],

src/dinglehopper/ocrd_cli.py
@ -1,61 +1,55 @@
import json
from functools import cached_property
import os
from typing import Optional
import click
from ocrd_models import OcrdFileType
from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
from pkg_resources import resource_string
from ocrd_utils import make_file_id
from .cli import process as cli_process
OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
@click.command()
@ocrd_cli_options
def ocrd_dinglehopper(*args, **kwargs):
return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
class OcrdDinglehopperEvaluate(Processor):
def __init__(self, *args, **kwargs):
kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
def process(self):
assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
assert_file_grp_cardinality(self.output_file_grp, 1)
@cached_property
def executable(self):
return 'ocrd-dinglehopper'
log = getLogger("processor.OcrdDinglehopperEvaluate")
def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
assert self.parameter
metrics = self.parameter["metrics"]
textequiv_level = self.parameter["textequiv_level"]
gt_grp, ocr_grp = self.input_file_grp.split(",")
input_file_tuples = self.zip_input_files(on_error="abort")
for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
# wrong number of inputs: let fail
gt_file, ocr_file = input_files
# missing on either side: skip (zip_input_files already warned)
if not gt_file or not ocr_file:
# file/page was not found in this group
continue
gt_file = self.workspace.download_file(gt_file)
ocr_file = self.workspace.download_file(ocr_file)
page_id = gt_file.pageId
return
# missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
if not gt_file.local_filename:
if config.OCRD_MISSING_INPUT == 'ABORT':
raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype)
return
if not ocr_file.local_filename:
if config.OCRD_MISSING_INPUT == 'ABORT':
raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype)
return
log.info("INPUT FILES %i / %s%s", n, gt_file, ocr_file)
page_id = gt_file.pageId
file_id = make_file_id(ocr_file, self.output_file_grp)
report_prefix = os.path.join(self.output_file_grp, file_id)
# Process the files
try:
os.mkdir(self.output_file_grp)
except FileExistsError:
pass
cli_process(
gt_file.local_filename,
ocr_file.local_filename,
report_prefix,
file_id,
self.output_file_grp,
metrics=metrics,
textequiv_level=textequiv_level,
)
@ -65,12 +59,16 @@ class OcrdDinglehopperEvaluate(Processor):
[".html", "text/html"],
[".json", "application/json"],
]:
output_file_id = file_id + report_suffix
output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set")
self.workspace.add_file(
file_id=file_id + report_suffix,
file_id=output_file_id,
file_grp=self.output_file_grp,
page_id=page_id,
mimetype=mimetype,
local_filename=report_prefix + report_suffix,
local_filename=file_id + report_suffix,
)
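
The port follows OCR-D/core v3's processor API: tool metadata is resolved via the `executable` name, and per-page work moves from a `process()` loop into `process_page_file()`. A condensed, hypothetical sketch of the new shape (details as in the diff above, not a drop-in implementation):

```python
from functools import cached_property
from typing import Optional

from ocrd import Processor
from ocrd_models import OcrdFileType


class MyEvaluateProcessor(Processor):  # hypothetical example processor
    @cached_property
    def executable(self):
        # core v3 looks up the tool description by executable name
        return "ocrd-my-evaluate"

    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
        gt_file, ocr_file = input_files  # input fileGrp cardinality is 2
        if not gt_file or not ocr_file:
            return  # page missing on one side; core has already warned
        # ... compare gt_file.local_filename with ocr_file.local_filename
```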

@@ -138,17 +138,17 @@
   <mets:fileSec>
     <mets:fileGrp USE="OCR-D-GT-PAGE">
       <mets:file MIMETYPE="application/xml" ID="OCR-D-GT-PAGE_00000024">
-        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-GT-PAGE/00000024.page.xml"/>
+        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-GT-PAGE/00000024.page.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
       </mets:file>
     </mets:fileGrp>
     <mets:fileGrp USE="OCR-D-OCR-CALAMARI">
       <mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-CALAMARI_0001">
-        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml"/>
+        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
       </mets:file>
     </mets:fileGrp>
     <mets:fileGrp USE="OCR-D-OCR-TESS">
       <mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-TESS_0001">
-        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml"/>
+        <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
       </mets:file>
     </mets:fileGrp>
   </mets:fileSec>

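For context, a minimal sketch of how such a fileSec is queried from Python (assuming a local mets.xml and the ocrd package):

from ocrd import Resolver

workspace = Resolver().workspace_from_url("mets.xml")  # path is an example
gt_file = next(workspace.mets.find_files(fileGrp="OCR-D-GT-PAGE"))
ocr_file = next(workspace.mets.find_files(fileGrp="OCR-D-OCR-CALAMARI"))
print(gt_file.ID, ocr_file.ID)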
@@ -13,12 +13,13 @@ def test_text():
     test1 = ExtractedText(
         None,
         [
-            ExtractedText("s0", None, None, "foo"),
-            ExtractedText("s1", None, None, "bar"),
-            ExtractedText("s2", None, None, "bazinga"),
+            ExtractedText("s0", None, None, "foo", grapheme_clusters("foo")),
+            ExtractedText("s1", None, None, "bar", grapheme_clusters("bar")),
+            ExtractedText("s2", None, None, "bazinga", grapheme_clusters("bazinga")),
         ],
         " ",
         None,
+        None,
     )
     assert test1.text == "foo bar bazinga"
@@ -29,8 +30,20 @@ def test_text():

 def test_normalization_check():
     with pytest.raises(ValueError, match=r".*is not in NFC.*"):
-        ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ"))
-    assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ"))
+        ExtractedText(
+            "foo",
+            None,
+            None,
+            unicodedata.normalize("NFD", "Schlyñ"),
+            grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")),
+        )
+    assert ExtractedText(
+        "foo",
+        None,
+        None,
+        unicodedata.normalize("NFC", "Schlyñ"),
+        grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")),
+    )


 AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
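The test's premise, illustrated: NFC composes "ñ" into one code point, NFD decomposes it into "n" plus a combining tilde, and ExtractedText only accepts the former:

import unicodedata

nfc = unicodedata.normalize("NFC", "Schlyñ")
nfd = unicodedata.normalize("NFD", "Schlyñ")
assert nfc != nfd
assert len(nfd) == len(nfc) + 1  # ñ → n + U+0303 COMBINING TILDE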
@@ -47,25 +60,27 @@ def test_align():
     test1 = ExtractedText(
         None,
         [
-            ExtractedText("s0", None, None, "foo"),
-            ExtractedText("s1", None, None, "bar"),
-            ExtractedText("s2", None, None, "batzinga"),
+            ExtractedText("s0", None, None, "foo", grapheme_clusters("foo")),
+            ExtractedText("s1", None, None, "bar", grapheme_clusters("bar")),
+            ExtractedText("s2", None, None, "batzinga", grapheme_clusters("batzinga")),
         ],
         " ",
         None,
+        None,
     )
     test2 = ExtractedText(
         None,
         [
-            ExtractedText("x0", None, None, "foo"),
-            ExtractedText("x1", None, None, "bar"),
+            ExtractedText("x0", None, None, "foo", grapheme_clusters("foo")),
+            ExtractedText("x1", None, None, "bar", grapheme_clusters("bar")),
             # extra .
-            ExtractedText("x2", None, None, "."),
+            ExtractedText("x2", None, None, ".", grapheme_clusters(".")),
             # deletion + different grapheme cluster, m̃ also is two Python characters
-            ExtractedText("x3", None, None, "bazim̃ga"),
+            ExtractedText("x3", None, None, "bazim̃ga", grapheme_clusters("bazim̃ga")),
         ],
         " ",
         None,
+        None,
     )
     left_pos = 0

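The comment about m̃ is worth spelling out: it is two Python characters but one user-perceived character, which is why the tests now pass pre-computed grapheme clusters alongside the text. A quick check with uniseg:

from uniseg.graphemecluster import grapheme_clusters

assert len("m̃") == 2  # 'm' + U+0303 COMBINING TILDE
assert list(grapheme_clusters("m̃")) == ["m̃"]  # but one grapheme cluster
assert len(list(grapheme_clusters("bazim̃ga"))) == 7  # not 8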
@@ -1,6 +1,8 @@
+import math
+
 import pytest
 
-from .. import align, distance, seq_align
+from .. import align, distance, score_hint, seq_align
 from .util import unzip
@@ -183,3 +185,8 @@ def test_lines_similar():
     # Test __eq__ (i.e. is it a substitution or a similar string?)
     assert list(left)[0] == list(right)[0]
+
+
+def test_score_hint():
+    assert score_hint(0.5, 23) == 12  # int(ceil())
+    assert score_hint(math.inf, 12345) is None

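A hedged sketch of what the two assertions imply score_hint computes (the actual implementation may differ in edge-case handling): the expected error rate times the length, rounded up, with None when no finite cutoff exists:

import math
from typing import Optional

def score_hint_sketch(expected_error_rate: float, n: int) -> Optional[int]:
    try:
        return int(math.ceil(expected_error_rate * n))
    except (OverflowError, ValueError):  # e.g. math.inf
        return None

assert score_hint_sketch(0.5, 23) == 12
assert score_hint_sketch(math.inf, 12345) is None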
@ -21,9 +21,9 @@ def test_cli_directory(tmp_path):
os.path.join(data_dir, "directory-test", "ocr"),
"report",
str(tmp_path / "reports"),
False,
True,
"line",
metrics=False,
differences=True,
textequiv_level="line",
)
assert os.path.exists(tmp_path / "reports/1.xml-report.json")
@@ -45,9 +45,9 @@ def test_cli_fail_without_gt(tmp_path):
             os.path.join(data_dir, "directory-test", "ocr"),
             "report",
             str(tmp_path / "reports"),
-            False,
-            True,
-            "line",
+            metrics=False,
+            differences=True,
+            textequiv_level="line",
         )
 
     assert len(os.listdir(tmp_path / "reports")) == 2 * 2

@@ -0,0 +1,35 @@
+from __future__ import division, print_function
+
+import math
+
+import pytest
+
+from .. import character_error_rate, plain_text
+from .util import working_directory
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize(
+    "gt_file_content,ocr_file_content,cer_expected",
+    [
+        ("", "Lorem ipsum", math.inf),
+        ("Lorem ipsum", "", 1.0),
+        ("\ufeff", "Lorem ipsum", math.inf),
+        ("Lorem ipsum", "\ufeff", 1.0),
+        ("", "", 0.0),
+        ("\ufeff", "", 0.0),
+        ("", "\ufeff", 0.0),
+    ],
+)
+def test_empty_files(tmp_path, gt_file_content, ocr_file_content, cer_expected):
+    with working_directory(tmp_path):
+        with open("gt.txt", "w") as gtf:
+            gtf.write(gt_file_content)
+        with open("ocr.txt", "w") as ocrf:
+            ocrf.write(ocr_file_content)
+
+        gt_text = plain_text("gt.txt")
+        ocr_text = plain_text("ocr.txt")
+        assert character_error_rate(gt_text, ocr_text) == cer_expected

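The table encodes the usual CER convention: edit distance divided by reference length, which has no finite value for an empty reference and non-empty OCR. (The \ufeff rows behave like the empty rows, presumably because the BOM is stripped when the plain text is read.) A hedged sketch, using code-point length where the real code counts grapheme clusters:

import math
from rapidfuzz.distance import Levenshtein

def cer_sketch(reference: str, compared: str) -> float:
    d = Levenshtein.distance(reference, compared)
    if d == 0:
        return 0.0
    if len(reference) == 0:
        return math.inf
    return d / len(reference)

assert cer_sketch("", "Lorem ipsum") == math.inf
assert cer_sketch("Lorem ipsum", "") == 1.0
assert cer_sketch("", "") == 0.0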
@@ -34,9 +34,8 @@ def test_ocrd_cli(tmp_path):
         "-O",
         "OCR-D-OCR-CALAMARI-EVAL",
     ]
-    sys.argv[
-        1:
-    ] = args  # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
+    # Hack to satisfy ocrd_cli_wrap_processor() check for arguments
+    sys.argv[1:] = args
     result = runner.invoke(ocrd_dinglehopper, args)
     assert result.exit_code == 0
     result_json = list((test_workspace_dir / "OCR-D-OCR-CALAMARI-EVAL").glob("*.json"))

@@ -177,8 +177,8 @@ def test_text():
 def test_plain(tmp_path):
     with working_directory(tmp_path):
         with open("ocr.txt", "w") as ocrf:
-            ocrf.write("AAAAB")
+            ocrf.write("First, a line.\nAnd a second line.\n")
 
         result = plain_text("ocr.txt")
-        expected = "AAAAB"
+        expected = "First, a line.\nAnd a second line."
         assert result == expected

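The new expectation pins down plain_text()'s newline behavior: interior line breaks survive, the trailing one does not. A minimal stand-in with the same property (the real function goes through ExtractedText; the name below is hypothetical):

def plain_text_sketch(path: str) -> str:
    with open(path, "r") as f:
        return "\n".join(line.rstrip("\n") for line in f.readlines())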
@@ -1,7 +1,5 @@
-from __future__ import division
-
 import unicodedata
-from typing import Iterable, Tuple
+from typing import Generator, Iterable, Tuple, TypeVar
 
 import uniseg.wordbreak
 from multimethod import multimethod
@@ -9,6 +7,8 @@ from rapidfuzz.distance import Levenshtein
 
 from .extracted_text import ExtractedText
 
+T = TypeVar("T")
+
 # Did we patch uniseg.wordbreak.word_break already?
 word_break_patched = False
@@ -22,11 +22,11 @@ def patch_word_break():
     """
     old_word_break = uniseg.wordbreak.word_break
 
-    def new_word_break(c, index=0):
+    def new_word_break(c):
         if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
-            return "ALetter"
+            return uniseg.wordbreak.Word_Break.ALetter
         else:
-            return old_word_break(c, index)
+            return old_word_break(c)
 
     uniseg.wordbreak.word_break = new_word_break
     global word_break_patched
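The point of the patch: OCR output sometimes uses Private Use Area code points (e.g. for unresolved glyphs), and stock uniseg treats them as break opportunities. A quick check, assuming patch_word_break() from this module is in scope:

import uniseg.wordbreak

patch_word_break()  # defined above
# Unpatched, U+E000 would split this into "foo", "\ue000", "bar".
assert list(uniseg.wordbreak.words("foo\ue000bar")) == ["foo\ue000bar"]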
@@ -34,7 +34,7 @@ def patch_word_break():
 
 @multimethod
-def words(s: str):
+def words(s: str) -> Generator[str, None, None]:
     """Extract words from a string"""
 
     global word_break_patched
@@ -54,7 +54,7 @@ def words(s: str):
     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on
     # word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain
-    # only whitespace, punctation "or similar characters."
+    # only whitespace, punctuation "or similar characters."
     for word in uniseg.wordbreak.words(s):
         if all(unwanted(c) for c in word):
             pass
@@ -62,37 +62,37 @@ def words(s: str):
             yield word
 
 
-@multimethod
-def words(s: ExtractedText):
-    return words(s.text)
+@words.register
+def _(s: ExtractedText) -> Generator[str, None, None]:
+    yield from words(s.text)
 
 
 @multimethod
-def words_normalized(s: str):
-    return words(unicodedata.normalize("NFC", s))
+def words_normalized(s: str) -> Generator[str, None, None]:
+    yield from words(unicodedata.normalize("NFC", s))
 
 
-@multimethod
-def words_normalized(s: ExtractedText):
-    return words_normalized(s.text)
+@words_normalized.register
+def _(s: ExtractedText) -> Generator[str, None, None]:
+    yield from words_normalized(s.text)
 
 
 @multimethod
 def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
     reference_seq = list(words_normalized(reference))
     compared_seq = list(words_normalized(compared))
-    return word_error_rate_n(reference_seq, compared_seq)
+    wer, n = word_error_rate_n(reference_seq, compared_seq)
+    return wer, n
 
 
-@multimethod
-def word_error_rate_n(
-    reference: ExtractedText, compared: ExtractedText
-) -> Tuple[float, int]:
-    return word_error_rate_n(reference.text, compared.text)
+@word_error_rate_n.register
+def _(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
+    wer, n = word_error_rate_n(reference.text, compared.text)
+    return wer, n
 
 
-@multimethod
-def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
+@word_error_rate_n.register
+def _(reference: Iterable[T], compared: Iterable[T]) -> Tuple[float, int]:
     reference_seq = list(reference)
     compared_seq = list(compared)
@@ -106,6 +106,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
     return d / n, n
 
 
-def word_error_rate(reference, compared) -> float:
+def word_error_rate(reference: T, compared: T) -> float:
+    wer: float
     wer, _ = word_error_rate_n(reference, compared)
     return wer

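The overload pattern used above (one @multimethod base definition plus @<fn>.register for additional signatures), shown in isolation as a small demo:

from multimethod import multimethod

@multimethod
def double(x: int) -> int:
    return 2 * x

@double.register
def _(x: str) -> str:
    return x + x

assert double(21) == 42
assert double("ab") == "abab"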