mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-07-05 16:39:59 +02:00

Compare commits


206 commits

Author SHA1 Message Date
3443edd6d3
Merge pull request #145 from bertsky/master
update docker
2025-05-13 12:41:50 +02:00
Robert Sachunsky
b1ef3af1a8 docker: use latest core base stage 2025-05-02 00:19:11 +02:00
Robert Sachunsky
d09e3969f8 docker: prepackage ocrd-all-module-dir.json 2025-05-02 00:19:11 +02:00
b5e99d96c9
Merge pull request #144 from qurator-spk/fix/make-test-results-clearer
✔  GitHub Actions: Make reporting results clearer
2025-04-25 11:31:29 +02:00
774790c36f ✔ GitHub Actions: Make reporting results clearer
In the "Actions" tab on GitHub, the workflow run that would post test results to the
_original_ workflow run is named "Test Report". This would lead me to click on it to see
the results, just to be disappointed.

This aims to make the naming of the GitHub workflows/jobs clearer.
2025-04-25 11:20:00 +02:00
addb572922
Merge pull request #143 from qurator-spk/chore/update-pre-commit
⚙  pre-commit: update
2025-04-25 10:14:30 +02:00
1ebb004386 ⚙ pre-commit: update 2025-04-25 10:13:06 +02:00
c3aa48ec3b Merge branch 'master' of https://github.com/qurator-spk/dinglehopper 2025-04-24 17:16:06 +02:00
628594ef98 📦 v0.11.0 2025-04-24 17:14:44 +02:00
d7814db705
Merge pull request #142 from qurator-spk/feat/flex-line-dirs
Feat/flex line dirs
2025-04-24 16:48:22 +02:00
5639f3db7f ✔ Add a test that checks if plain text files with BOM are read correctly 2025-04-24 16:44:29 +02:00
9fc8937324 ✒ README: Mention dinglehopper-line-dirs --help 2025-04-24 15:13:19 +02:00
14a4bc56d8 🐛 Add --plain-encoding option to dinglehopper-extract 2025-04-22 18:24:35 +02:00
a70260c10e 🐛 Use warning() to fix DeprecationWarning 2025-04-22 13:57:19 +02:00
224aa02163 🚧 Fix help text 2025-04-22 13:57:19 +02:00
9db5b4caf5 🚧 Add OCR-D parameter for plain text encoding 2025-04-22 13:57:19 +02:00
5578ce83a3 🚧 Add option for text encoding to line dir cli 2025-04-22 13:57:19 +02:00
cf59b951a3 🚧 Add option for text encoding to line dir cli 2025-04-22 13:57:19 +02:00
480b3cf864 ✔ Test that CLI produces a complete HTML report 2025-04-22 13:57:19 +02:00
f1a586cff1 ✔ Test line dirs CLI 2025-04-22 13:57:18 +02:00
3b16c14c16 ✔ Properly test line dir finding 2025-04-22 13:57:18 +02:00
322faeb26c 🎨 Sort imports 2025-04-22 13:57:18 +02:00
c37316da09 🐛 cli_line_dirs: Fix word differences section
At the time of generation of the section, the {gt,ocr}_words generators
were drained. Fix by using a list.

Fixes gh-124.
2025-04-22 13:57:18 +02:00
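The underlying pitfall is plain Python: a generator can only be consumed once, so computing the error rate first left nothing for the word-differences section. A minimal illustration of the failure mode and the list-based fix (hypothetical helper names, not the dinglehopper code):

```python
def words(text):
    # Hypothetical stand-in for words_normalized(): yields tokens lazily.
    for w in text.split():
        yield w


gt_words = words("the quick brown fox")
ocr_words = words("the quick brown fux")

# The first consumer drains both generators ...
n_errors = sum(g != o for g, o in zip(gt_words, ocr_words))

# ... so a second pass sees nothing and the "word differences" section stays empty.
assert list(zip(gt_words, ocr_words)) == []

# Fix: materialize the generators once and reuse the lists.
gt_words = list(words("the quick brown fox"))
ocr_words = list(words("the quick brown fux"))
n_errors = sum(g != o for g, o in zip(gt_words, ocr_words))
diff = [(g, o) for g, o in zip(gt_words, ocr_words) if g != o]
assert n_errors == 1 and diff == [("fox", "fux")]
```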
9414a92f9f 🐛 cli_line_dirs: Type-annotate functions 2025-04-22 13:57:18 +02:00
68344e48f8 🎨 Reformat cli_line_dirs 2025-04-22 13:57:18 +02:00
73ee16fe51 🚧 Support 'merged' GT+OCR line directories 2025-04-22 13:57:18 +02:00
6980d7a252 🚧 Use our own removesuffix() as we still support Python 3.8 2025-04-22 13:57:18 +02:00
2bf2529c38 🚧 Port new line dir functions 2025-04-22 13:57:17 +02:00
ad8e6de36b 🐛 cli_line_dirs: Fix character diff reports 2025-04-22 13:57:17 +02:00
4024e350f7 🚧 Test new flexible line dirs functions 2025-04-22 13:57:17 +02:00
3c317cbeaf
Merge pull request #141 from qurator-spk/chore/update-pre-commit
⚙  pre-commit: update
2025-04-22 12:35:14 +02:00
d8403421fc ⚙ pre-commit: update 2025-04-22 12:30:47 +02:00
3305043234
Merge pull request #140 from qurator-spk/fix/vendor-strings
🐛 Fix vendor strings
2025-04-22 11:50:29 +02:00
6bf5bd7178 🐛 Fix vendor strings 2025-04-22 11:48:44 +02:00
817e0c95f7 📦 v0.10.1 2025-04-22 10:32:29 +02:00
3d7c7ee1e3
Merge pull request #139 from bertsky/allow-uniseg-py38
re-allow uniseg 0.8 and py38
2025-04-22 10:09:51 +02:00
Robert Sachunsky
a24623b966 re-allow py38 2025-04-17 16:47:13 +02:00
Robert Sachunsky
ea33602336 CI: reactivate py38 2025-04-17 16:12:42 +02:00
Robert Sachunsky
64444dd419 opt out of 7f8a8dd5 (uniseg update that requires py39) 2025-04-17 16:12:37 +02:00
f6dfb77f94 🐛 pyproject.toml: Fix description 2025-04-17 08:51:32 +02:00
ef817cb343 📦 v0.10.0 2025-04-17 08:37:37 +02:00
b1c109baae
Merge pull request #128 from kba/v3-api
V3 api
2025-04-17 08:34:51 +02:00
13ab1ae150 🐛 Docker: Use same vendor as license for now 2025-04-17 08:26:36 +02:00
d974369e13 🐛 Docker: Fix description 2025-04-17 08:10:56 +02:00
b7bdca4ac8 🐛 Makefile: Make phony targets .PHONY 2025-04-17 08:09:06 +02:00
kba
831a24fc4c typo: report_prefix -> file_id 2025-04-17 08:04:52 +02:00
Konstantin Baierer
f6a2c94520 ocrd_cli: but do check for existing output files
Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
2025-04-17 08:04:52 +02:00
Konstantin Baierer
4162836612 ocrd_cli: no need to check fileGrp dir exists
Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
2025-04-17 08:04:52 +02:00
Konstantin Baierer
c0aa82d188 OCR-D processor: properly handle missing or non-downloaded GT/OCR file
Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
2025-04-17 08:04:51 +02:00
kba
8c1b6d65f5 Dockerfile: build ocrd-all-tool.json 2025-04-17 08:04:51 +02:00
f287386c0e 🧹Don't pin uniseg and rapidfuzz
Breakage with the newest uniseg API was fixed in master.

Can't see any issue with rapidfuzz, so removing that pin, too.
2025-04-16 14:49:23 +02:00
kba
63031b30bf Port to OCR-D/core API v3 2025-04-16 14:45:16 +02:00
bf6633be02
Merge pull request #136 from qurator-spk/chore/update-liccheck
⚙ liccheck: update permissible licenses (mit-cmu, psf 2.0, iscl)
2025-04-16 11:13:02 +02:00
d3aa9eb520 ⚙ liccheck: update permissible licenses (mit-cmu, psf 2.0, iscl) 2025-04-16 11:09:33 +02:00
625686f204
Merge pull request #135 from qurator-spk/chore/update-python-version
⚙  pyproject.toml: Update supported Python version
2025-04-16 11:01:09 +02:00
ce7886af23 ⚙ pyproject.toml: Update supported Python version 2025-04-16 10:57:10 +02:00
a09a624bde
Merge pull request #132 from qurator-spk/fix/uniseg-removed-index-parameter
🐛 Fix for changed API of uniseg's word_break
2025-04-16 09:28:31 +02:00
badfa9c99e ⚙ GitHub Actions: Don't test on Python 3.8 anymore 2025-04-16 09:25:44 +02:00
7f8a8dd564 🐛 Fix for changed API of uniseg's word_break 2025-04-16 09:10:43 +02:00
b72d4f5af9
Merge pull request #131 from qurator-spk/chore/update-pre-commit
⚙  pre-commit: update
2025-04-16 09:06:05 +02:00
058042accb ⚙ pre-commit: update 2025-04-16 08:59:58 +02:00
071e6a8bd1
Merge pull request #120 from joschrew/dockerfile
Add Dockerfile and Makefile to create ocr-d dockerimage
2024-10-11 18:04:22 +02:00
6b82293670
Update Dockerfile
I fancy-clicked @bertsky's change suggestion, which duplicated some labels. Now fancy-clicking the fix, fingers crossed...
2024-10-07 17:41:59 +02:00
6ecf49a355
Update Dockerfile
Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com>
2024-10-07 17:39:42 +02:00
joschrew
9c7c104dce Add Dockerfile and Makefile to create ocr-d image 2024-10-02 15:29:36 +02:00
2e6fe0c279
Merge pull request #113 from qurator-spk/python-3.13
✔ Test on Python 3.13
2024-09-04 19:15:04 +02:00
1753ed4d13 ✔ Test on Python 3.13 2024-09-04 19:09:45 +02:00
3233dbcc8f ✔ pre-commit: Add license check 2024-07-22 16:54:33 +02:00
f2e290dffe 🐛 Fix --version option in OCR-D CLI 2024-07-19 14:54:46 +02:00
6d1daf1dfe Support --version option in CLI 2024-07-19 14:41:54 +02:00
27ad145c7e ⚙ pyproject.toml: Add license.file 2024-07-19 09:58:01 +02:00
2e9e88cc1e ⚙ pre-commit: Update hooks 2024-07-19 09:56:40 +02:00
129e6eb427 📦 v0.9.7 2024-07-11 17:25:38 +02:00
cf998443c1 ⚙ ruff: Update settings (select → lint.select) 2024-07-11 17:15:24 +02:00
6048107889 Merge branch 'master' of https://github.com/qurator-spk/dinglehopper 2024-07-11 16:26:29 +02:00
2ee37ed4e3 🎨 Sort imports 2024-07-11 16:25:38 +02:00
521f034fba
Merge pull request #116 from stweil/master
Fix typo
2024-07-10 01:13:24 +02:00
d1a2247615 ⚙ pre-commit: Update hooks 2024-07-09 21:07:59 +02:00
4047f8b6e5 🐛 Fix loading ocrd-tool.json for Python 3.12 2024-07-09 21:01:31 +02:00
Stefan Weil
cd68a973cb Fix typo
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2024-05-26 09:18:00 +02:00
bc5818da9f ✔ GitHub Actions: Update used actions 2024-05-14 15:56:08 +02:00
c91234daba ✔ GitHub Actions: Update used actions 2024-05-13 21:17:42 +02:00
a534b5e28e ⚙ pre-commit: Update hooks 2024-05-13 21:16:29 +02:00
b336f98271 🐛 Fix reading plain text files
As reported by @tallemeersch in gh-107, newlines were not removed for plain text files.
Fix this by stripping the lines as suggested.

Fixes gh-107.
2024-05-06 18:14:16 +02:00
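A rough sketch of the idea behind this fix, not the actual dinglehopper code: when reading a plain text line file, trailing newlines have to be stripped, otherwise every line differs from its counterpart by a newline character and the CER is inflated.

```python
from typing import List


def read_plain_lines(path: str, encoding: str = "utf-8") -> List[str]:
    """Hypothetical reader: return the file's lines without trailing newlines."""
    with open(path, encoding=encoding) as f:
        # "foo\n" vs. "foo" would otherwise count as a character difference.
        return [line.rstrip("\r\n") for line in f]
```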
41a0fad352 📦 v0.9.6 2024-05-06 17:48:48 +02:00
e72d1e37ea Revert "✔ Test on Python 3.13"
This reverts commit 0d5c6d5a62.
2024-05-06 17:42:09 +02:00
86e723cd53 🐛 GHA: Install possible shapely build requirements (if building from source) 2024-05-06 17:25:59 +02:00
dc4565fd2d
Merge pull request #111 from stweil/typos
Fix some typos (found by `codespell` and `typos`)
2024-05-06 17:23:31 +02:00
fbcb9160fd 🐛 GHA: Install possible lxml build requirements (if building from source) 2024-05-06 17:09:05 +02:00
0d5c6d5a62 ✔ Test on Python 3.13 2024-05-06 17:02:52 +02:00
e34adbf41c 🐛 Fix Python 3.12 support by requiring ocrd >= 2.65.0 2024-05-06 16:59:18 +02:00
58a688b175 ⚙ pre-commit: Update hooks 2024-05-06 16:57:53 +02:00
Stefan Weil
79701e410d Fix some typos (found by codespell and typos)
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2024-04-29 08:42:17 +02:00
2383730a55 ✔ Test using empty files
Test edge cases + empty files, e.g. empty text content and a Unicode BOM character.

See also gh-79.
2024-04-08 20:33:03 +02:00
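A sketch of what such edge-case tests could look like (illustrative only; the file names and the reading helper are assumptions, the edge cases are the ones named in the commit message):

```python
import pytest

# Edge cases: empty file, a single newline, a lone Unicode BOM.
CASES = [
    ("empty.txt", b"", ""),
    ("newline-only.txt", b"\n", ""),
    ("bom-only.txt", b"\xef\xbb\xbf", ""),
]


@pytest.mark.parametrize("name,raw,expected", CASES)
def test_empty_like_files(tmp_path, name, raw, expected):
    path = tmp_path / name
    path.write_bytes(raw)
    # utf-8-sig drops a leading BOM; strip() removes the lone newline.
    assert path.read_text(encoding="utf-8-sig").strip() == expected
```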
98d7928f45 ⚙ pre-commit: Update hooks 2024-04-08 20:27:47 +02:00
edabffec7e 🧹 tests: Move comment out of the code (bad style + weird formatting) 2024-04-04 19:46:08 +02:00
32d4037533 ⚙ cli: Annotate types in process_dir() 2024-04-04 19:38:27 +02:00
fe1a713d55 ⚙ pre-commit: Update hooks 2024-04-04 19:33:47 +02:00
be7c1dd25d 🧹 Make from_text_segment()'s textequiv_level keyword-only 2024-03-27 21:09:34 +01:00
932bfafc7d 🧹 Make process_dir() keyword arguments keyword-only 2024-03-27 19:44:09 +01:00
945aec5673 ✒ README-DEV: Releasing a new version 2024-03-27 19:01:49 +01:00
c29a80bc81 📦 v0.9.5 2024-03-27 18:49:13 +01:00
a1c1d0ad49 ⚙ pre-commit: Add mypy dependencies
Closes gh-106.
2024-03-27 18:32:49 +01:00
5d9f0c482f 🐛 Check that we always get a valid ALTO namespace (satisfies mypy) 2024-03-27 17:57:53 +01:00
19d1a00817 🎨 Reformat (Black) 2024-03-27 17:36:05 +01:00
4dc6b7dc04 ⚙ pre-commit: Update hooks 2024-03-26 19:40:07 +01:00
6b3697c864 Merge branch 'master' of https://github.com/qurator-spk/dinglehopper 2024-03-26 19:35:08 +01:00
4d4ead4cc8 🐛 Fix word segmentation with uniseg 0.8.0 2024-03-26 19:34:22 +01:00
0e3d24cac1
🐛 README.md: Fix badge (for real) 2024-03-26 19:01:25 +01:00
4016c01638
🐛 README.md: Fix test badge 2024-03-26 18:58:56 +01:00
4b64398cec 🚧 GitLab CI Test: Depend on child pipeline 2024-01-12 16:38:27 +01:00
7e033b6f03 🚧 GitLab CI Test: Depend on child pipeline 2024-01-12 16:35:08 +01:00
250ee2b7f2 🚧 GitLab CI Test: Push after pulling 2024-01-12 16:26:55 +01:00
76c4533aa5 🚧 GitLab CI Test: Push after pulling 2024-01-12 16:25:52 +01:00
f8e31089b3 🚧 GitLab CI Test: Push after pulling 2024-01-12 16:17:38 +01:00
6cfb49fe39 🚧 GitLab CI Test: Push after pulling 2024-01-12 16:14:22 +01:00
5eba65f097 🚧 GitLab CI Test: Trigger only on default branch (and do not hardcode it) 2024-01-12 16:08:26 +01:00
83cef3106f 🚧 GitLab CI Test 2024-01-12 12:46:40 +01:00
a95a85a889 🚧 GitLab CI Test 2024-01-12 12:45:32 +01:00
ff34c65c1e 🔍 ruff: Remove ignore configuration, we use multimethods in a compatible way now 2024-01-12 12:42:40 +01:00
21c44d426e ⚙ pre-commit: Update hooks 2024-01-12 12:38:47 +01:00
10ccba989e 🚧 GitLab CI Test 2024-01-11 20:43:34 +01:00
10d423f045 🚧 GitLab CI Test 2024-01-11 20:41:11 +01:00
6d947a9ca9 🚧 GitLab CI Test 2024-01-11 20:37:41 +01:00
484da90d27 🚧 GitLab CI Test 2024-01-11 20:34:07 +01:00
d0ddfa68a1 🚧 GitLab CI Test 2024-01-11 20:25:00 +01:00
81391132f0 🚧 GitLab CI Test 2024-01-11 20:22:28 +01:00
dc390cd3f8 🚧 GitLab CI Test 2024-01-11 20:17:10 +01:00
c77e8f51ab 🚧 GitLab CI Test 2024-01-11 20:09:30 +01:00
e083688c66 🚧 GitLab CI Test 2024-01-11 20:01:56 +01:00
6d8afc27b3 🚧 GitLab CI Test 2024-01-11 19:51:18 +01:00
af83b35f23 🚧 GitLab CI Test 2024-01-11 19:18:35 +01:00
344f96dca9 🚧 GitLab CI Test 2024-01-11 18:09:29 +00:00
483e809691 🔍 mypy: Use an almost strict mypy configuration, and fix any issues 2024-01-10 19:12:07 +01:00
ad316aeabc 🔍 mypy: Use a compatible syntax for multimethod 2024-01-09 15:58:29 +01:00
8166435958 🔍 mypy: Remove ExtractedText.segments converter 2024-01-08 19:33:25 +01:00
24c25b6fcd 🔍 mypy: Avoid using check() for all attr validators 2024-01-08 19:30:20 +01:00
ac9d360dcd 🔍 mypy: Make cli.process() typed so mypy checks it (and issues no warning) 2024-01-08 19:04:36 +01:00
788868b2ac Merge branch 'pr103' 2024-01-08 17:59:12 +01:00
59a3882ce5 🧹 GitHub Actions: Clean up whitespace 2024-01-08 17:57:51 +01:00
Sadra Barikbin
4466422cda Fix a typo 2024-01-08 17:56:32 +01:00
Sadra Barikbin
967f833eac Improve report 2024-01-08 17:56:32 +01:00
Sadra Barikbin
f4ff6a8f31 Change reporter 2024-01-08 17:56:32 +01:00
Sadra Barikbin
4413ddac8f Temporary commit 2024-01-08 17:56:32 +01:00
Sadra Barikbin
6884c5c825 Update dorny dependency 2024-01-08 17:56:31 +01:00
Sadra Barikbin
c90a61c12c Fix a few typos 2024-01-08 17:55:53 +01:00
Sadra Barikbin
bf47308c00 Add report_tests workflow 2024-01-08 17:51:37 +01:00
4bf123de43 ⚙ Update ruff+mypy dependencies 2024-01-08 17:45:02 +01:00
b36727ed9e ⚙ pre-commit: Update hooks 2024-01-08 17:43:48 +01:00
7a192880f1 ⬆ Move on to supporting Python >= 3.8 only 2024-01-03 20:58:24 +01:00
c752793be6 🐛 Use typing.List instead of list, for Python <3.9 2024-01-03 20:52:07 +01:00
071766efc2 🐛 Use Optional instead of | None, for Python <3.10 2024-01-03 20:40:06 +01:00
4832d1542f ⚙ pre-commit: Update hooks 2024-01-03 20:38:49 +01:00
c1681551af 🐛 Fix generating word differences 2024-01-03 19:21:53 +01:00
44bd4b5eda ⚙ pre-commit: Update hooks 2024-01-02 20:38:40 +01:00
296a820990 Merge branch 'master' of https://github.com/qurator-spk/dinglehopper 2024-01-02 20:23:12 +01:00
38fcbc8e1c Merge branch 'master' into performance 2024-01-02 20:22:38 +01:00
d3fb3f96cf
Merge pull request #101 from sadra-barikbin/patch-1
Fix a tiny typo in Levenshtein notebook
2023-12-21 17:31:25 +01:00
Sadra Barikbin
b0e906ad00
Update Levenshtein.ipynb
Fix a tiny typo in Levenshtein notebook.
2023-12-21 11:55:06 +03:30
68a12f8f7f ⬆ Update uniseg dependency
@maxbachmann also improved the performance of uniseg, and it is in 0.7.2 - update our
dependency.
2023-11-01 13:48:07 +01:00
de6cd8f1e7 Make joining grapheme clusters more robust by checking joiner and handling an empty joiner 2023-10-31 20:40:27 +01:00
7c6ee593f0 🐛 Fix score_hint call in cli_line_dirs 2023-10-31 19:13:19 +01:00
618ea567de 🐛 Fix docstring of distance() for grapheme clusters 2023-10-31 19:08:25 +01:00
e256526ea1 🐛 Fix calculation of score_hint for edge cases, e.g. when CER is infinite
If the CER is infinite, we can't calculate a score_hint as an int. Fall back to None
in this case.
2023-10-31 19:01:13 +01:00
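The hint passed to rapidfuzz is the expected edit distance, roughly CER × length, and that product has no int representation when the CER is infinite (empty reference) or when it is NaN (infinite rate times zero length). A condensed version of the helper this change introduces (the full function appears in the align.py hunk further below):

```python
from math import ceil
from typing import Optional


def score_hint(er: float, n: int) -> Optional[int]:
    """Expected edit distance as an int, or None if it is not finite."""
    try:
        return int(ceil(er * n))
    except (OverflowError, ValueError):
        # ceil(inf) raises OverflowError, ceil(nan) raises ValueError.
        return None


assert score_hint(0.25, 100) == 25
assert score_hint(float("inf"), 100) is None  # infinite CER
assert score_hint(float("inf"), 0) is None    # inf * 0 == nan
```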
bc95c03127 🕸Do not use deprecated ID, pageId options
See gh-75.
2023-10-27 19:26:01 +02:00
7fef02bf0a ✔ Add mets:FLocat's @LOCTYPE/OTHERLOCTYPE to test data
Newest OCR-D wasn't happy with the test data anymore (see gh-89). I'm not sure if the
test data was invalid the way it was, but having a LOCTYPE certainly is "prettier" so
adding it. This fixes the test again.
2023-10-27 18:48:00 +02:00
7ed076d3c1 ⬆ Update multimethod dependency
We had some issues while reviewing/rebasing #72. We don't support Python 3.5 anymore,
so lifting the hard pin on multimethod 1.3.
2023-10-27 18:43:27 +02:00
f077ce2e1b 🐛 dinglehopper-summarize: Handle reports without difference stats 2023-10-27 18:09:19 +02:00
39dc4186d6
Merge pull request #97 from qurator-spk/clean-remove-six-dep-again
🧹 Remove old six dependency (workaround for OCR-D/core#730)
2023-10-23 16:19:42 +02:00
d776368484
Merge pull request #96 from qurator-spk/test-on-pr-but-really
🐛 (Hopefully) Fix running tests on PR
2023-10-23 16:11:05 +02:00
3f8c8e69aa 🐛 (Hopefully) Fix running tests on PR 2023-10-23 16:07:44 +02:00
d8f84ec9ac 🧹 Remove old six dependency (workaround for OCR-D/core#730) 2023-10-23 15:53:14 +02:00
df1d4d09f3
Merge pull request #94 from qurator-spk/test-on-pr
✔ GitHub Actions: Test on PR
2023-10-23 15:49:33 +02:00
e7e0703d9d ✔ GitHub Actions: Test on PR 2023-10-23 15:45:20 +02:00
22e7247ac4
Merge pull request #93 from qurator-spk/update-dep-multimethod
⬆ Update multimethod dependency
2023-10-23 15:29:06 +02:00
1c3b28d873 ⬆ Update multimethod dependency
We had some issues while reviewing/rebasing #72. We don't support Python 3.5 anymore,
so lifting the hard pin on multimethod 1.3.
2023-10-23 15:26:20 +02:00
05b5502c57
Merge pull request #92 from qurator-spk/update-pre-commit
Update pre commit
2023-10-23 15:18:56 +02:00
fe60361e8d ✒ README-DEV: Make pre-commit section top-level (+ small whitespace fix) 2023-10-23 15:17:06 +02:00
8a1ea4ec93 🎨 Add newlines at end of files (ruff) 2023-10-23 15:15:00 +02:00
4e0d4dcf09 ⚙ pre-commit: Add pre-commit-update hook (to update hooks using pre-commit) 2023-10-23 15:08:16 +02:00
061ba16461 ⚙ pre-commit: Update hooks 2023-10-23 15:07:22 +02:00
0c727dca9d
Merge pull request #91 from qurator-spk/test-remove-circleci
✔ Remove CircleCI config
2023-10-23 14:38:25 +02:00
1b7c2a61a3 ✔ Remove CircleCI config 2023-10-23 14:34:33 +02:00
994a27d458
Merge pull request #90 from qurator-spk/test-on-python-3.12
✔ GitHub Actions: Test on Python 3.12
2023-10-23 14:14:35 +02:00
5450f193e4 ✔ GitHub Actions: Test on Python 3.12 2023-10-23 14:08:14 +02:00
9d862e418b ✔ Add mets:FLocat's @LOCTYPE/OTHERLOCTYPE to test data
Newest OCR-D wasn't happy with the test data anymore (see gh-89). I'm not sure if the
test data was invalid the way it was, but having a LOCTYPE certainly is "prettier" so
adding it. This fixes the test again.
2023-10-23 14:03:14 +02:00
dbaccdd5e3 ✒ README: Minor whitespace cleanup 2023-08-11 20:28:29 +02:00
54a3121172 ✒ README: Recommend installing via pip and from PyPI 2023-08-11 20:28:01 +02:00
a1a7f95ac6 📦 v0.9.4 2023-08-11 20:07:06 +02:00
1e7c46285b 🎨 editorconfig: *.json should have a final newline 2023-08-11 20:06:49 +02:00
9594b4c9d2 🧹 pyproject: Remove extra *.json 2023-08-11 20:04:35 +02:00
de70b198ac 🧹 Remove empty setup.cfg 2023-08-11 20:04:02 +02:00
6c70afbbc5 📦 v0.9.3 2023-08-11 19:53:04 +02:00
12b1ea3ae7 🐛 Remove MANIFEST.in workaround, now that setuptools_ocrd is fixed 2023-08-11 19:52:12 +02:00
98a67c7b3b 📦 v0.9.2 2023-08-04 20:35:42 +02:00
668072e338 🧹 .gitignore dist/ 2023-08-04 20:34:35 +02:00
563642c93b 🐛 Workaround sdist not containing top-level ocrd-tool.json
See https://github.com/qurator-spk/setuptools_ocrd/issues/10 - The sdist does not
contain ocrd-tool.json, so that the wheel built from it does not get the proper version.
Needs to be fixed in setuptools_ocrd, then MANIFEST.in can be removed again.
2023-08-04 20:31:32 +02:00
a18b25b163 🐛 Update tests for ExtractedText
In PR gh-72, @maxbachmann introduced a new argument for ExtractedText(). Update the
corresponding tests.
2023-01-27 19:13:45 +01:00
Max Bachmann
f48e305347
use uniseg again 2022-10-12 18:52:58 +02:00
Max Bachmann
d2bbc8a6c7 update rapidfuzz version 2022-09-11 02:38:32 +02:00
Max Bachmann
a1f0a5e2d3 replace uniseg with uniseg2 2022-08-29 22:08:25 +02:00
Max Bachmann
22c3817f45 apply black 2022-08-29 01:50:19 +02:00
Max Bachmann
01571f23b7 move grapheme clusters to ExtractedText 2022-08-29 01:49:04 +02:00
Max Bachmann
f211d09f56 remove python2.7 futures 2022-08-29 00:50:33 +02:00
Max Bachmann
205a969c0e remove unused includes 2022-08-29 00:48:40 +02:00
Max Bachmann
f3825cdeb6
only call words_normalized once 2022-08-29 00:22:23 +02:00
59 changed files with 1097 additions and 339 deletions

View file

@@ -1,20 +0,0 @@
version: 2.1
jobs:
  black:
    parameters:
      python-version:
        type: string
    docker:
      - image: cimg/python:<< parameters.python-version >>
    steps:
      - checkout
      - run: pip3 install --upgrade pip
      - run: pip3 install black
      - run: black .
workflows:
  black:
    jobs:
      - black:
          python-version: "3.11"

5
.dockerignore Normal file
View file

@ -0,0 +1,5 @@
src/dinglehopper/tests
dist
build
*.egg-info
.git

View file

@@ -15,7 +15,7 @@ indent_size = 2
 [*.json]
 indent_size = 2
-insert_final_newline = false
+insert_final_newline = true

 # trailing spaces in markdown indicate word wrap
 [*.md]

View file

@@ -17,7 +17,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-       uses: actions/checkout@v3
+       uses: actions/checkout@v4
      - name: Upgrade pip
        run: python3 -m pip install --upgrade pip
      - name: Install setuptools
@@ -32,7 +32,7 @@ jobs:
      - name: Build package
        run: python3 -m pip install --upgrade build && python3 -m build
      - name: Upload dist
-       uses: actions/upload-artifact@v3
+       uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist/
@@ -42,7 +42,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Download dist
-       uses: actions/download-artifact@v3
+       uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/
@@ -61,7 +61,7 @@ jobs:
      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
    steps:
      - name: Download dist
-       uses: actions/download-artifact@v3
+       uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/

View file

@@ -1,4 +1,4 @@
-name: test
+name: 'Test'

 on:
@@ -6,6 +6,10 @@ on:
    branches:
      - master
+  pull_request:
+    branches:
+      - master
  schedule:
    - cron: "00 16 07 * *"  # = monthly
@@ -21,30 +25,27 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-       python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10", "3.11" ]
+       python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ]
-   # For Python 3.6, we need to fall back to Ubuntu 20.04
-   runs-on: ${{ matrix.python-version == '3.6' && 'ubuntu-20.04' || 'ubuntu-latest' }}
-   env:
-     test_results_dir: test-results-${{ matrix.python-version }}
+   runs-on: "ubuntu-latest"

    steps:
      - name: Set up Python
-       uses: actions/setup-python@v4
+       uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
+         allow-prereleases: true
      - name: Checkout
-       uses: actions/checkout@v3
+       uses: actions/checkout@v4
+     - name: Install possible lxml build requirements (if building from source)
+       run: sudo apt-get install -y libxml2-dev libxslt-dev python3-dev
+     - name: Install possible shapely build requirements (if building from source)
+       run: sudo apt-get install -y libgeos-dev
      - name: Update pip
        run: python3 -m pip install -U pip
-     - name: Avoid compiling OpenCV and NumPy on Python 3.6
-       run: |
-         if python3 --version | grep -q "Python 3.6"; then
-           pip install --prefer-binary -U opencv-python-headless numpy
-         fi
      - name: Install requirements*.txt
        run: |
          for requirements_txt in requirements*.txt; do
@@ -54,19 +55,10 @@ jobs:
      - name: Test
        run: |
          cd src
-         mkdir -p ../$test_results_dir
-         python3 -m pytest --junitxml=../$test_results_dir/junit.xml -o junit_family=legacy
+         python3 -m pytest --junitxml=../${{matrix.python-version}}-junit.xml -o junit_family=legacy
      - name: Upload test results
-       uses: actions/upload-artifact@v3
+       uses: actions/upload-artifact@v4
        if: success() || failure()
        with:
-         name: ${{ env.test_results_dir }}
-         path: ${{ env.test_results_dir }}
+         name: test-results-${{matrix.python-version}}
+         path: ${{matrix.python-version}}-junit.xml
-     - name: Report tests
-       uses: dorny/test-reporter@v1
-       if: success() || failure()
-       with:
-         name: Results on Python ${{ matrix.python-version }}
-         path: "${{env.test_results_dir }}/junit.xml"
-         reporter: java-junit

20
.github/workflows/test_report.yml vendored Normal file
View file

@@ -0,0 +1,20 @@
name: 'Test - Report results'
on:
  workflow_run:
    workflows: ['test']
    types:
      - completed
permissions:
  contents: read
  actions: read
  checks: write
jobs:
  report:
    runs-on: ubuntu-latest
    steps:
      - uses: dorny/test-reporter@v1
        with:
          artifact: /test-results-(.*)/
          name: 'test - Results ($1)'
          path: '*junit.xml'
          reporter: java-junit

2
.gitignore vendored
View file

@@ -25,6 +25,8 @@ dmypy.json
 # User-specific stuff
 .idea
+.*.swp

 # Build artifacts
 /build
+/dist

16
.gitlab-ci.yml Normal file
View file

@@ -0,0 +1,16 @@
variables:
  http_proxy: "http://http-proxy.sbb.spk-berlin.de:3128/"
  https_proxy: "http://http-proxy.sbb.spk-berlin.de:3128/"
  HTTP_PROXY: "http://http-proxy.sbb.spk-berlin.de:3128/"
  HTTPS_PROXY: "http://http-proxy.sbb.spk-berlin.de:3128/"

stages:
  - triggers

mirror:
  stage: triggers
  trigger:
    include: .gitlab/mirror.yml
    strategy: depend
  rules:
    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH

47
.gitlab/mirror.yml Normal file
View file

@@ -0,0 +1,47 @@
stages:
  - check
  - pull
  - push

default:
  image: debian

check:
  stage: check
  script:
    - whoami; env
    - if [ -z "$CI_COMMIT_BRANCH" ]; then echo "Not on a branch" >&2; exit 3; fi

pull-gitlab:
  stage: pull
  script:
    - echo "This is redundant"

pull-github:
  stage: pull
  before_script:
    - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
  script:
    - git remote remove github 2>/dev/null || true
    - git remote add github https://github.com/qurator-spk/dinglehopper.git
    - git remote -v
    - git pull github "$CI_COMMIT_BRANCH"

push-gitlab:
  stage: push
  before_script:
    - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
  script:
    - git push origin "$CI_COMMIT_SHA":"$CI_COMMIT_BRANCH"

push-github:
  stage: push
  before_script:
    - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
  script:
    - git push github "$CI_COMMIT_SHA":"$CI_COMMIT_BRANCH"

View file

@@ -1,8 +1,6 @@
-# See https://pre-commit.com for more information
-# See https://pre-commit.com/hooks.html for more hooks
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v3.2.0
+    rev: v5.0.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer
@@ -13,17 +11,37 @@ repos:
       - id: check-ast

   - repo: https://github.com/psf/black
-    rev: 22.10.0
+    rev: 25.1.0
     hooks:
       - id: black

   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.0.280
+    rev: v0.11.7
     hooks:
-      - id: ruff
-        args: [--fix, --exit-non-zero-on-fix]
+      - args:
+          - --fix
+          - --exit-non-zero-on-fix
+        id: ruff

   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.4.1
+    rev: v1.15.0
     hooks:
-      - id: mypy
+      - additional_dependencies:
+          - types-setuptools
+          - types-lxml
+          - numpy  # for numpy plugin
+          - attrs
+          - multimethod
+          - rapidfuzz
+        id: mypy
+
+  - repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
+    rev: v0.6.1
+    hooks:
+      - id: pre-commit-update
+
+  - repo: https://github.com/dhatim/python-license-check
+    rev: 0.9.2
+    hooks:
+      - id: liccheck
+        language: system

40
Dockerfile Normal file
View file

@ -0,0 +1,40 @@
ARG DOCKER_BASE_IMAGE
FROM $DOCKER_BASE_IMAGE
ARG VCS_REF
ARG BUILD_DATE
LABEL \
maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
org.label-schema.vcs-ref=$VCS_REF \
org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
org.label-schema.build-date=$BUILD_DATE \
org.opencontainers.image.vendor="Staatsbibliothek zu BerlinSPK" \
org.opencontainers.image.title="dinglehopper" \
org.opencontainers.image.description="An OCR evaluation tool" \
org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
org.opencontainers.image.revision=$VCS_REF \
org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.base.name=ocrd/core
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8
# avoid HOME/.local/share (hard to predict USER here)
# so let XDG_DATA_HOME coincide with fixed system location
# (can still be overridden by derived stages)
ENV XDG_DATA_HOME /usr/local/share
# avoid the need for an extra volume for persistent resource user db
# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
WORKDIR /build/dinglehopper
COPY . .
COPY ocrd-tool.json .
# prepackage ocrd-tool.json as ocrd-all-tool.json
RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
# prepackage ocrd-all-module-dir.json
RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json
RUN make install && rm -rf /build/dinglehopper
WORKDIR /data
VOLUME /data

View file

@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.

-   Copyright 2019 qurator
+   Copyright 2019-2025 Staatsbibliothek zu BerlinSPK

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

34
Makefile Normal file
View file

@@ -0,0 +1,34 @@
PYTHON = python3
PIP = pip3
PYTHONIOENCODING=utf8
PYTEST_ARGS = -vv

DOCKER_BASE_IMAGE ?= docker.io/ocrd/core:latest
DOCKER_TAG ?= ocrd/dinglehopper
DOCKER ?= docker

help:
	@echo
	@echo "  Targets"
	@echo
	@echo "    install    Install full Python package via pip"
	@echo "    docker     Build the ocrd/dinglehopper docker image"

# Install Python package via pip
install:
	$(PIP) install .

install-dev:
	$(PIP) install -e .

test:
	pytest $(PYTEST_ARGS)

docker:
	$(DOCKER) build \
	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
	--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
	-t $(DOCKER_TAG) .

.PHONY: help install install-dev test docker

View file

@@ -10,6 +10,7 @@ pytest
 ```

 ## Test running examples
+
 Only unit tests:
 ```bash
 pytest -m "not integration"
@@ -36,9 +37,21 @@ pytest -k "not test" --mypy
 pytest -k "not test" --ruff
 ```

-## How to use pre-commit
+# How to use pre-commit

 This project optionally uses [pre-commit](https://pre-commit.com) to check commits. To use it:

 - Install pre-commit, e.g. `pip install -r requirements-dev.txt`
 - Install the repo-local git hooks: `pre-commit install`
+
+# Releasing a new version
+
+- Update `ocrd-tool.json`
+- `git commit`
+- `git tag vx.y.z`
+- `git push && git push --tags`
+- The GitHub Actions workflow `release` will now create
+  a. a new release on GitHub and
+  b. a new release on PyPI
+- Currently requires a review for PYPI?

View file

@@ -8,7 +8,7 @@ compares a ground truth (GT) document page with a OCR result page to compute
 metrics and a word/character differences report. It also supports batch processing by
 generating, aggregating and summarizing multiple reports.

-[![Tests](https://github.com/qurator-spk/dinglehopper/workflows/test/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
+[![Tests](https://github.com/qurator-spk/dinglehopper/actions/workflows/test.yml/badge.svg)](https://github.com/qurator-spk/dinglehopper/actions?query=workflow:"test")
 [![GitHub tag](https://img.shields.io/github/tag/qurator-spk/dinglehopper?include_prereleases=&sort=semver&color=blue)](https://github.com/qurator-spk/dinglehopper/releases/)
 [![License](https://img.shields.io/badge/License-Apache-blue)](#license)
 [![issues - dinglehopper](https://img.shields.io/github/issues/qurator-spk/dinglehopper)](https://github.com/qurator-spk/dinglehopper/issues)
@@ -23,10 +23,11 @@ Goals
 Installation
 ------------

-It's best to use pip, e.g.:
-~~~
-sudo pip install .
-~~~
+It's best to use pip to install the package from PyPI, e.g.:
+```
+pip install dinglehopper
+```

 Usage
 -----
@@ -99,11 +100,11 @@ This generates `summary.html` and `summary.json` in the same `output_folder`.
 If you are summarizing many reports and have used the `--differences` flag while
 generating them, it may be useful to limit the number of differences reported by using
-the `--occurences-threshold` parameter. This will reduce the size of the generated HTML
+the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
 report, making it easier to open and navigate. Note that the JSON report will still
 contain all differences. Example:

 ~~~
-dinglehopper-summarize output_folder/ --occurences-threshold 10
+dinglehopper-summarize output_folder/ --occurrences-threshold 10
 ~~~

 ### dinglehopper-line-dirs
@@ -111,9 +112,13 @@ You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.
 with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate
 CLI interface:

-~~~
+```
 dinglehopper-line-dirs gt/ ocr/
-~~~
+```
+
+The CLI `dinglehopper-line-dirs` can also work with GT text files in the same
+directories as the the OCR text files. You should read `dinglehopper-line-dirs --help`
+in this case.

 ### dinglehopper-extract

 The tool `dinglehopper-extract` extracts the text of the given input file on

View file

@@ -7,9 +7,10 @@ authors = [
     {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
     {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
 ]
-description = "The OCR evaluation tool"
+description = "An OCR evaluation tool"
 readme = "README.md"
-requires-python = ">=3.6"
+license.file = "LICENSE"
+requires-python = ">=3.8"
 keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
 dynamic = ["version", "dependencies", "optional-dependencies"]
@@ -48,7 +49,7 @@ optional-dependencies.dev = {file = ["requirements-dev.txt"]}
 where = ["src"]

 [tool.setuptools.package-data]
-dinglehopper = ["*.json", "templates/*"]
+dinglehopper = ["templates/*", "*.json"]

 [tool.pytest.ini_options]
@@ -60,11 +61,54 @@ markers = [
 [tool.mypy]
+plugins = ["numpy.typing.mypy_plugin"]
 ignore_missing_imports = true
+strict = true
+disallow_subclassing_any = false
+# ❗ error: Class cannot subclass "Processor" (has type "Any")
+disallow_any_generics = false
+disallow_untyped_defs = false
+disallow_untyped_calls = false

-[tool.ruff]
+[tool.ruff.lint]
 select = ["E", "F", "I"]
-ignore = [
-    "F811",  # multimethods are considered redefinitions by ruff
-]
+
+[tool.liccheck]
+authorized_licenses = [
+    "bsd",
+    "new bsd",
+    "bsd license",
+    "new bsd license",
+    "simplified bsd",
+    "apache",
+    "apache 2.0",
+    "apache software license",
+    "apache software",
+    "apache license 2.0",
+    "gnu lgpl",
+    "lgpl with exceptions or zpl",
+    "GNU Library or Lesser General Public License (LGPL)",
+    "GNU Lesser General Public License v3 (LGPLv3)",
+    "GNU Lesser General Public License v2 or later (LGPLv2+)",
+    "mit",
+    "mit license",
+    "mit-cmu",
+    "python software foundation",
+    "psf",
+    "psf-2.0",
+    "Historical Permission Notice and Disclaimer (HPND)",
+    "public domain",
+    'The Unlicense (Unlicense)',
+    "isc",
+    "ISC License (ISCL)",
+    'Mozilla Public License 2.0 (MPL 2.0)',
+]
+unauthorized_licenses = [
+    "gpl v3",
+]

View file

@@ -1,8 +1,14 @@
 pytest
 pytest-cov
-pytest-mypy
 black
 pre-commit
-ruff ; python_version >= "3.7"
-pytest-ruff ; python_version >= "3.7"
+ruff
+pytest-ruff
+mypy
+types-lxml
+types-setuptools
+pytest-mypy
+liccheck

View file

@@ -1,14 +1,14 @@
 click
 jinja2
 lxml
-uniseg
+uniseg >= 0.8.0
 numpy
 colorama
 MarkupSafe
-ocrd >= 2.20.1
+ocrd >= 3.3.0
 attrs
-multimethod == 1.3  # latest version to officially support Python 3.5
+multimethod >= 1.3
 tqdm
-rapidfuzz >= 2.4.2
+rapidfuzz >= 2.7.0
-six  # XXX workaround OCR-D/core#730
 chardet
+importlib_resources

View file

@@ -1,4 +1,4 @@
-from .align import align, seq_align
+from .align import align, score_hint, seq_align
 from .character_error_rate import character_error_rate, character_error_rate_n
 from .edit_distance import distance, editops
 from .extracted_text import ExtractedText
@@ -16,6 +16,7 @@ __all__ = [
     "editops",
     "distance",
     "align",
+    "score_hint",
     "seq_align",
     "character_error_rate",
     "character_error_rate_n",

View file

@@ -1,8 +1,10 @@
+import math
 import unicodedata
+from math import ceil
+from typing import Optional

 from rapidfuzz.distance import Levenshtein
-from uniseg.graphemecluster import grapheme_clusters
+
+from .edit_distance import grapheme_clusters


 def align(t1, t2):
@@ -12,11 +14,27 @@ def align(t1, t2):
     return seq_align(s1, s2)


-def seq_align(s1, s2):
+def score_hint(er: float, n: int) -> Optional[int]:
+    """Calculate RapidFuzz score hint for a given error rate and count.
+
+    Gives the score hint for the distance functions (= expected distance) or None if
+    the error rate is inf.
+    """
+    assert not math.isnan(er)
+    try:
+        score_hint = int(ceil(er * n))
+    except (OverflowError, ValueError):
+        # ceil(er * n) can be inf or NaN (for n == 0), so int() can throw an
+        # OverflowError and a ValueError.
+        score_hint = None
+    return score_hint
+
+
+def seq_align(s1, s2, score_hint=None):
     """Align general sequences."""
     s1 = list(s1)
     s2 = list(s2)
-    ops = Levenshtein.editops(s1, s2)
+    ops = Levenshtein.editops(s1, s2, score_hint=score_hint)
     i = 0
     j = 0

View file

@@ -1,7 +1,5 @@
-from __future__ import division
-
 import unicodedata
-from typing import Tuple
+from typing import List, Tuple, TypeVar

 from multimethod import multimethod
 from uniseg.graphemecluster import grapheme_clusters
@@ -9,9 +7,13 @@ from uniseg.graphemecluster import grapheme_clusters
 from .edit_distance import distance
 from .extracted_text import ExtractedText

+T = TypeVar("T")
+

 @multimethod
-def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
+def character_error_rate_n(
+    reference: List[str], compared: List[str]
+) -> Tuple[float, int]:
     """
     Compute character error rate.
@@ -19,7 +21,7 @@ def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
     """
     d = distance(reference, compared)
-    n = len(list(grapheme_clusters(unicodedata.normalize("NFC", reference))))
+    n = len(reference)

     if d == 0:
         return 0, n
@@ -30,18 +32,28 @@
     # XXX Should we really count newlines here?

-@multimethod
-def character_error_rate_n(
-    reference: ExtractedText, compared: ExtractedText
-) -> Tuple[float, int]:
-    return character_error_rate_n(reference.text, compared.text)
+@character_error_rate_n.register
+def _(reference: str, compared: str) -> Tuple[float, int]:
+    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", reference)))
+    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", compared)))
+    cer, n = character_error_rate_n(seq1, seq2)
+    return cer, n


-def character_error_rate(reference, compared) -> float:
+@character_error_rate_n.register
+def _(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
+    cer, n = character_error_rate_n(
+        reference.grapheme_clusters, compared.grapheme_clusters
+    )
+    return cer, n
+
+
+def character_error_rate(reference: T, compared: T) -> float:
     """
     Compute character error rate.

     :return: character error rate
     """
+    cer: float
     cer, _ = character_error_rate_n(reference, compared)
     return cer

View file

@@ -1,13 +1,13 @@
 import os
 from collections import Counter
+from typing import List

 import click
 from jinja2 import Environment, FileSystemLoader
 from markupsafe import escape
 from ocrd_utils import initLogging
-from uniseg.graphemecluster import grapheme_clusters

-from dinglehopper.align import seq_align
+from dinglehopper.align import score_hint, seq_align
 from dinglehopper.character_error_rate import character_error_rate_n
 from dinglehopper.config import Config
 from dinglehopper.extracted_text import ExtractedText
@@ -15,7 +15,9 @@ from dinglehopper.ocr_files import extract
 from dinglehopper.word_error_rate import word_error_rate_n, words_normalized


-def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
+def gen_diff_report(
+    gt_in, ocr_in, css_prefix, joiner, none, *, differences=False, score_hint=None
+):
     gtx = ""
     ocrx = ""
@@ -42,9 +44,8 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
     if isinstance(gt_in, ExtractedText):
         if not isinstance(ocr_in, ExtractedText):
             raise TypeError()
-        # XXX splitting should be done in ExtractedText
-        gt_things = list(grapheme_clusters(gt_in.text))
-        ocr_things = list(grapheme_clusters(ocr_in.text))
+        gt_things = gt_in.grapheme_clusters
+        ocr_things = ocr_in.grapheme_clusters
     else:
         gt_things = gt_in
         ocr_things = ocr_in
@@ -53,7 +54,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
     o_pos = 0
     found_differences = []

-    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
+    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, score_hint)):
         css_classes = None
         gt_id = None
         ocr_id = None
@@ -76,7 +77,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
         if o is not None:
             o_pos += len(o)

-    found_differences = dict(Counter(elem for elem in found_differences))
+    counted_differences = dict(Counter(elem for elem in found_differences))

     return (
         """
@@ -87,7 +88,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
         """.format(
             gtx, ocrx
         ),
-        found_differences,
+        counted_differences,
     )
@@ -105,39 +106,56 @@ def json_float(value):
 def process(
-    gt,
-    ocr,
-    report_prefix,
-    reports_folder=".",
+    gt: str,
+    ocr: str,
+    report_prefix: str,
+    reports_folder: str = ".",
     *,
-    metrics=True,
-    differences=False,
-    textequiv_level="region",
-):
+    metrics: bool = True,
+    differences: bool = False,
+    textequiv_level: str = "region",
+    plain_encoding: str = "autodetect",
+) -> None:
     """Check OCR result against GT.

     The @click decorators change the signature of the decorated functions, so we keep
     this undecorated version and use Click on a wrapper.
     """
-    gt_text = extract(gt, textequiv_level=textequiv_level)
-    ocr_text = extract(ocr, textequiv_level=textequiv_level)
+    gt_text = extract(
+        gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    )
+    ocr_text = extract(
+        ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    )
+    gt_words: List[str] = list(words_normalized(gt_text))
+    ocr_words: List[str] = list(words_normalized(ocr_text))

+    assert isinstance(gt_text, ExtractedText)
+    assert isinstance(ocr_text, ExtractedText)
     cer, n_characters = character_error_rate_n(gt_text, ocr_text)
-    wer, n_words = word_error_rate_n(gt_text, ocr_text)
-
     char_diff_report, diff_c = gen_diff_report(
-        gt_text, ocr_text, css_prefix="c", joiner="", none="·", differences=differences
+        gt_text,
+        ocr_text,
+        css_prefix="c",
+        joiner="",
+        none="·",
+        score_hint=score_hint(cer, n_characters),
+        differences=differences,
     )

-    gt_words = words_normalized(gt_text)
-    ocr_words = words_normalized(ocr_text)
+    # {gt,ocr}_words must not be a generator, so we don't drain it for the differences
+    # report.
+    assert isinstance(gt_words, list)
+    assert isinstance(ocr_words, list)
+    wer, n_words = word_error_rate_n(gt_words, ocr_words)
     word_diff_report, diff_w = gen_diff_report(
         gt_words,
         ocr_words,
         css_prefix="w",
         joiner=" ",
         none="",
+        score_hint=score_hint(wer, n_words),
         differences=differences,
     )
@@ -174,8 +192,16 @@ def process(
 def process_dir(
-    gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level
-):
+    gt: str,
+    ocr: str,
+    report_prefix: str,
+    reports_folder: str = ".",
+    *,
+    metrics: bool = True,
+    differences: bool = False,
+    textequiv_level: str = "region",
+    plain_encoding: str = "autodetect",
+) -> None:
     for gt_file in os.listdir(gt):
         gt_file_path = os.path.join(gt, gt_file)
         ocr_file_path = os.path.join(ocr, gt_file)
@@ -189,6 +215,7 @@ def process_dir(
                 metrics=metrics,
                 differences=differences,
                 textequiv_level=textequiv_level,
+                plain_encoding=plain_encoding,
             )
         else:
             print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
@@ -213,7 +240,13 @@ def process_dir(
     help="PAGE TextEquiv level to extract text from",
     metavar="LEVEL",
 )
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding (e.g. "utf-8") of plain text files',
+)
 @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
+@click.version_option()
 def main(
     gt,
     ocr,
@@ -222,6 +255,7 @@ def main(
     metrics,
     differences,
     textequiv_level,
+    plain_encoding,
     progress,
 ):
     """
@@ -256,9 +290,10 @@ def main(
             ocr,
             report_prefix,
             reports_folder,
-            metrics,
-            differences,
-            textequiv_level,
+            metrics=metrics,
+            differences=differences,
+            textequiv_level=textequiv_level,
+            plain_encoding=plain_encoding,
         )
     else:
         process(
@@ -269,6 +304,7 @@ def main(
             metrics=metrics,
             differences=differences,
             textequiv_level=textequiv_level,
+            plain_encoding=plain_encoding,
         )

View file

@@ -12,7 +12,12 @@ from .ocr_files import extract
     help="PAGE TextEquiv level to extract text from",
     metavar="LEVEL",
 )
-def main(input_file, textequiv_level):
+@click.option(
+    "--plain-encoding",
+    default="autodetect",
+    help='Encoding (e.g. "utf-8") of plain text files',
+)
+def main(input_file, textequiv_level, plain_encoding):
     """
     Extract the text of the given INPUT_FILE.
@@ -23,7 +28,9 @@ def main(input_file, textequiv_level):
     use "--textequiv-level line" to extract from the level of TextLine tags.
     """
     initLogging()
-    input_text = extract(input_file, textequiv_level=textequiv_level).text
+    input_text = extract(
+        input_file, textequiv_level=textequiv_level, plain_encoding=plain_encoding
+    ).text
     print(input_text)

View file

@ -1,16 +1,53 @@
import itertools import itertools
import os import os
from typing import Callable, Iterator, List, Optional, Tuple
import click import click
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from ocrd_utils import initLogging from ocrd_utils import initLogging
from .align import score_hint
from .character_error_rate import character_error_rate_n from .character_error_rate import character_error_rate_n
from .cli import gen_diff_report, json_float from .cli import gen_diff_report, json_float
from .ocr_files import plain_extract from .ocr_files import plain_extract
from .word_error_rate import word_error_rate_n, words_normalized from .word_error_rate import word_error_rate_n, words_normalized
def removesuffix(text, suffix):
"""
Remove suffix from text.
Can be replaced with str.removesuffix when we only support Python >= 3.9.
"""
if suffix and text.endswith(suffix):
return text[: -len(suffix)]
return text
def is_hidden(filepath):
filename = os.path.basename(os.path.abspath(filepath))
return filename.startswith(".")
def find_all_files(
dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False
) -> Iterator[str]:
"""
Find all files in dir_, returning filenames
If pred is given, pred(filename) must be True for the filename.
Does not return hidden files by default.
"""
for root, _, filenames in os.walk(dir_):
for fn in filenames:
if not return_hidden and is_hidden(fn):
continue
if pred and not pred(fn):
continue
yield os.path.join(root, fn)
def all_equal(iterable): def all_equal(iterable):
g = itertools.groupby(iterable) g = itertools.groupby(iterable)
return next(g, True) and not next(g, False) return next(g, True) and not next(g, False)
@ -24,15 +61,63 @@ def common_suffix(its):
return reversed(common_prefix(reversed(it) for it in its)) return reversed(common_prefix(reversed(it) for it in its))
def removesuffix(text, suffix): def find_gt_and_ocr_files(
if suffix and text.endswith(suffix): gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str
return text[: -len(suffix)] ) -> Iterator[Tuple[str, str]]:
return text """
Find GT files and matching OCR files.
Returns pairs of GT and OCR files.
"""
for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
ocr_fn = os.path.join(
ocr_dir,
removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix,
)
if not os.path.exists(ocr_fn):
raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
yield gt_fn, ocr_fn
def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
gt_suffix = "".join(common_suffix(os.listdir(gt_dir))) """
ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir))) Find GT files and matching OCR files, autodetect suffixes.
This only works if gt_dir (or respectivley ocr_dir) only contains GT (OCR)
files with a common suffix. Currently the files must have a suffix, e.g.
".gt.txt" (e.g. ".ocr.txt").
Returns pairs of GT and OCR files.
"""
# Autodetect suffixes
gt_files = find_all_files(gt_dir)
gt_suffix = "".join(common_suffix(gt_files))
if len(gt_suffix) == 0:
raise RuntimeError(
f"Files in GT directory {gt_dir} do not have a common suffix"
)
ocr_files = find_all_files(ocr_dir)
ocr_suffix = "".join(common_suffix(ocr_files))
if len(ocr_suffix) == 0:
raise RuntimeError(
f"Files in OCR directory {ocr_dir} do not have a common suffix"
)
yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
def process(
gt_dir,
ocr_dir,
report_prefix,
*,
metrics=True,
gt_suffix=None,
ocr_suffix=None,
plain_encoding="autodetect",
):
cer = None cer = None
n_characters = None n_characters = None
@ -41,14 +126,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
n_words = None n_words = None
word_diff_report = "" word_diff_report = ""
for k, gt in enumerate(os.listdir(gt_dir)): if gt_suffix is not None and ocr_suffix is not None:
# Find a match by replacing the suffix gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
ocr = removesuffix(gt, gt_suffix) + ocr_suffix else:
gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True) for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
ocr_text = plain_extract( gt_text = plain_extract(
os.path.join(ocr_dir, ocr), include_filename_in_id=True gt_fn, include_filename_in_id=True, encoding=plain_encoding
) )
ocr_text = plain_extract(
ocr_fn, include_filename_in_id=True, encoding=plain_encoding
)
gt_words: List[str] = list(words_normalized(gt_text))
ocr_words: List[str] = list(words_normalized(ocr_text))
# Compute CER # Compute CER
l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text) l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
@ -62,7 +153,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
n_characters = n_characters + l_n_characters n_characters = n_characters + l_n_characters
# Compute WER # Compute WER
l_wer, l_n_words = word_error_rate_n(gt_text, ocr_text) l_wer, l_n_words = word_error_rate_n(gt_words, ocr_words)
if wer is None: if wer is None:
wer, n_words = l_wer, l_n_words wer, n_words = l_wer, l_n_words
else: else:
@ -72,13 +163,21 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
# Generate diff reports # Generate diff reports
char_diff_report += gen_diff_report( char_diff_report += gen_diff_report(
gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·" gt_text,
) ocr_text,
gt_words = words_normalized(gt_text) css_prefix="l{0}-c".format(k),
ocr_words = words_normalized(ocr_text) joiner="",
none="·",
score_hint=score_hint(l_cer, l_n_characters),
)[0]
word_diff_report += gen_diff_report( word_diff_report += gen_diff_report(
gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="" gt_words,
) ocr_words,
css_prefix="l{0}-w".format(k),
joiner=" ",
none="",
score_hint=score_hint(l_wer, l_n_words),
)[0]
env = Environment( env = Environment(
loader=FileSystemLoader( loader=FileSystemLoader(
@ -112,17 +211,30 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
@click.option( @click.option(
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red" "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
) )
def main(gt, ocr, report_prefix, metrics): @click.option("--gt-suffix", help="Suffix of GT line text files")
@click.option("--ocr-suffix", help="Suffix of OCR line text files")
@click.option(
"--plain-encoding",
default="autodetect",
help='Encoding (e.g. "utf-8") of plain text files',
)
def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
""" """
Compare the GT line text directory against the OCR line text directory. Compare the GT line text directory against the OCR line text directory.
This assumes that the GT line text directory contains textfiles with a common This assumes that the GT line text directory contains textfiles with a common
suffix like ".gt.txt", and the OCR line text directory contains textfiles with suffix like ".gt.txt", and the OCR line text directory contains textfiles with
a common suffix like ".some-ocr.txt". The text files also need to be paired, a common suffix like ".some-ocr.txt". The text files also need to be paired,
i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt" i.e. the GT filename "line001.gt.txt" needs to match a filename
in the OCT lines directory. "line001.some-ocr.txt" in the OCR lines directory.
The GT and OCR directories are usually round truth line texts and the results of GT and OCR directories may contain line text files in matching subdirectories,
e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt".
GT and OCR directories can also be the same directory, but in this case you need
to give --gt-suffix and --ocr-suffix explicitly.
The GT and OCR directories are usually ground truth line texts and the results of
an OCR software, but you may use dinglehopper to compare two OCR results. In an OCR software, but you may use dinglehopper to compare two OCR results. In
that case, use --no-metrics to disable the then meaningless metrics and also that case, use --no-metrics to disable the then meaningless metrics and also
change the color scheme from green/red to blue. change the color scheme from green/red to blue.
@ -131,9 +243,19 @@ def main(gt, ocr, report_prefix, metrics):
$REPORT_PREFIX defaults to "report". The reports include the character error $REPORT_PREFIX defaults to "report". The reports include the character error
rate (CER) and the word error rate (WER). rate (CER) and the word error rate (WER).
It is recommended to specify the encoding of the text files, for example with
--plain-encoding utf-8. If this option is not given, we try to auto-detect it.
""" """
initLogging() initLogging()
process(gt, ocr, report_prefix, metrics=metrics) process(
gt,
ocr,
report_prefix,
metrics=metrics,
gt_suffix=gt_suffix,
ocr_suffix=ocr_suffix,
plain_encoding=plain_encoding,
)
if __name__ == "__main__": if __name__ == "__main__":
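For reference, a hedged usage sketch of the new process() signature (import path as used by this repo's tests; the "merged" layout and the directory name are assumptions for illustration):

from dinglehopper.cli_line_dirs import process

# GT and OCR lines in one directory, distinguished only by their suffixes;
# suffix autodetection cannot work here, so both suffixes are given explicitly.
process(
    "lines",            # hypothetical directory holding *.gt.txt and *.some-ocr.txt
    "lines",
    "report",
    gt_suffix=".gt.txt",
    ocr_suffix=".some-ocr.txt",
    plain_encoding="utf-8",
)
# Writes report.html and report.json into the current working directory.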
@ -1,5 +1,6 @@
import json import json
import os import os
from typing import Dict
import click import click
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
@ -13,8 +14,8 @@ def process(reports_folder, occurrences_threshold=1):
wer_list = [] wer_list = []
cer_sum = 0 cer_sum = 0
wer_sum = 0 wer_sum = 0
diff_c = {} diff_c: Dict[str, int] = {}
diff_w = {} diff_w: Dict[str, int] = {}
for report in os.listdir(reports_folder): for report in os.listdir(reports_folder):
if report.endswith(".json"): if report.endswith(".json"):
@ -34,10 +35,15 @@ def process(reports_folder, occurrences_threshold=1):
cer_sum += cer cer_sum += cer
wer_sum += wer wer_sum += wer
for key, value in report_data["differences"]["character_level"].items(): try:
diff_c[key] = diff_c.get(key, 0) + value for key, value in report_data["differences"][
for key, value in report_data["differences"]["word_level"].items(): "character_level"
diff_w[key] = diff_w.get(key, 0) + value ].items():
diff_c[key] = diff_c.get(key, 0) + value
for key, value in report_data["differences"]["word_level"].items():
diff_w[key] = diff_w.get(key, 0) + value
except KeyError:
pass
if len(cer_list) == 0: if len(cer_list) == 0:
click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'") click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
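The aggregation above tolerates reports that lack a "differences" section. An equivalent, slightly more compact way to express the merge (illustration only; the difference key below is made up):

from collections import Counter

diff_c = Counter()
reports = [
    {"differences": {"character_level": {"a :: e": 2}}},
    {"cer": 0.1},  # e.g. a report generated without difference data
]
for report_data in reports:
    char_diffs = report_data.get("differences", {}).get("character_level", {})
    diff_c.update(char_diffs)
assert diff_c["a :: e"] == 2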
@ -1,6 +1,5 @@
from __future__ import division, print_function
import unicodedata import unicodedata
from typing import List
from multimethod import multimethod from multimethod import multimethod
from rapidfuzz.distance import Levenshtein from rapidfuzz.distance import Levenshtein
@ -10,7 +9,18 @@ from .extracted_text import ExtractedText
@multimethod @multimethod
def distance(s1: str, s2: str): def distance(seq1: List[str], seq2: List[str]) -> int:
"""Compute the Levenshtein edit distance between two lists of grapheme clusters.
This assumes that the grapheme clusters are already normalized.
Use distance(str, str) instead if you need to compare two Unicode strings.
"""
return Levenshtein.distance(seq1, seq2)
@distance.register
def _(s1: str, s2: str) -> int:
"""Compute the Levenshtein edit distance between two Unicode strings """Compute the Levenshtein edit distance between two Unicode strings
Note that this is different from levenshtein() as this function knows about Unicode Note that this is different from levenshtein() as this function knows about Unicode
@ -22,9 +32,9 @@ def distance(s1: str, s2: str):
return Levenshtein.distance(seq1, seq2) return Levenshtein.distance(seq1, seq2)
@multimethod @distance.register
def distance(s1: ExtractedText, s2: ExtractedText): def _(s1: ExtractedText, s2: ExtractedText) -> int:
return distance(s1.text, s2.text) return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters)
def editops(word1, word2): def editops(word1, word2):
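Why the switch to grapheme-cluster lists matters: RapidFuzz's Levenshtein.distance accepts arbitrary sequences, so comparing lists of clusters counts user-perceived characters instead of code points. A small self-contained check (not part of the diff):

from rapidfuzz.distance import Levenshtein
from uniseg.graphemecluster import grapheme_clusters

# "m̃" is one grapheme cluster but two code points (m + combining tilde).
a = list(grapheme_clusters("bazim̃ga"))
b = list(grapheme_clusters("bazinga"))
assert Levenshtein.distance(a, b) == 1                    # one substituted cluster
assert Levenshtein.distance("bazim̃ga", "bazinga") == 2   # code-point view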
@ -1,14 +1,16 @@
import enum import enum
import functools
import re import re
import unicodedata import unicodedata
from contextlib import suppress from contextlib import suppress
from itertools import repeat from itertools import repeat
from typing import Optional from typing import Any, Dict, List, Optional
import attr import attr
import numpy as np import numpy as np
from lxml import etree as ET from lxml import etree as ET
from ocrd_utils import getLogger from ocrd_utils import getLogger
from uniseg.graphemecluster import grapheme_clusters
class Normalization(enum.Enum): class Normalization(enum.Enum):
@ -120,7 +122,7 @@ class ExtractedText:
segment_id = attr.ib(type=Optional[str]) segment_id = attr.ib(type=Optional[str])
@segment_id.validator @segment_id.validator
def check(self, _, value): def is_valid_segment_id(self, _, value):
if value is None: if value is None:
return return
if not re.match(r"[\w\d_-]+", value): if not re.match(r"[\w\d_-]+", value):
@ -130,33 +132,85 @@ class ExtractedText:
# a. _text itself # a. _text itself
# b. or segments (ExtractedText) and a joiner # b. or segments (ExtractedText) and a joiner
segments = attr.ib(type=Optional[list], converter=attr.converters.optional(list)) segments = attr.ib(type=Optional[List["ExtractedText"]])
joiner = attr.ib(type=Optional[str]) joiner = attr.ib(type=Optional[str])
_text = attr.ib(type=Optional[str]) _text = attr.ib(type=Optional[str])
_grapheme_clusters = attr.ib(type=Optional[List[str]])
@segments.validator @segments.validator
def check(self, _, value): def cant_set_both_segments_and_text(self, _, value):
if value is not None and self._text is not None: if value is not None and self._text is not None:
raise ValueError("Can't have both segments and text") raise ValueError("Can't have both segments and text")
@joiner.validator
def is_valid_joiner(self, _, value):
if self.segments is None:
if value is not None:
raise ValueError("Can't have joiner without segments to join")
if self.segments is not None:
if value not in ("", " ", "\n"):
raise ValueError(f"Unexpected segment joiner value {repr(value)}")
@_text.validator @_text.validator
def check(self, _, value): def is_valid_text(self, _, value):
if value is not None and self.segments is not None: if value is None:
return
if self.segments is not None:
raise ValueError("Can't have both segments and text") raise ValueError("Can't have both segments and text")
if value is not None and unicodedata.normalize("NFC", value) != value: if unicodedata.normalize("NFC", value) != value:
raise ValueError('String "{}" is not in NFC.'.format(value)) raise ValueError('String "{}" is not in NFC.'.format(value))
if value is not None and normalize(value, self.normalization) != value: if normalize(value, self.normalization) != value:
raise ValueError('String "{}" is not normalized.'.format(value)) raise ValueError('String "{}" is not normalized.'.format(value))
if self._grapheme_clusters is None:
raise ValueError("Requires both text and grapheme clusters to be set")
@_grapheme_clusters.validator
def are_valid_grapheme_clusters(self, _, value):
if value is not None and self._text is None:
raise ValueError("Requires both text and grapheme clusters to be set")
normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB) normalization = attr.ib(converter=Normalization, default=Normalization.NFC_SBB)
@property @property
def text(self): def text(self) -> str:
if self._text is not None: if self._text is not None:
return self._text return self._text
else: else:
assert self.joiner is not None and self.segments is not None
return self.joiner.join(s.text for s in self.segments) return self.joiner.join(s.text for s in self.segments)
@functools.cached_property
def _joiner_grapheme_cluster(self):
"""We need the joiner as a list of 0 or 1 grapheme clusters.
This property is cached.
"""
assert self.joiner is not None
if len(self.joiner) > 0:
joiner_grapheme_cluster = list(grapheme_clusters(self.joiner))
assert len(joiner_grapheme_cluster) == 1 # see joiner's check above
elif len(self.joiner) == 0:
joiner_grapheme_cluster = []
else:
joiner_grapheme_cluster = None
return joiner_grapheme_cluster
@property
def grapheme_clusters(self):
if self._text is not None:
return self._grapheme_clusters
else:
# TODO Test with text extracted at glyph level (joiner == "")
clusters = []
assert self.segments is not None
for seg in self.segments:
clusters += seg.grapheme_clusters + self._joiner_grapheme_cluster
clusters = clusters[:-1]
return clusters
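The joining logic above, restated with plain lists standing in for segment grapheme clusters (a sketch; the joiner == "" case is the open TODO noted in the code):

segments = [["f", "o", "o"], ["b", "a", "r"]]
joiner = [" "]                 # 0 or 1 clusters, per the joiner validator
clusters = []
for seg in segments:
    clusters += seg + joiner   # append each segment plus one joiner cluster
clusters = clusters[:-1]       # drop the trailing joiner
assert clusters == ["f", "o", "o", " ", "b", "a", "r"]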
_segment_id_for_pos = None _segment_id_for_pos = None
def segment_id_for_pos(self, pos): def segment_id_for_pos(self, pos):
@ -167,6 +221,7 @@ class ExtractedText:
else: else:
# Recurse # Recurse
segment_id_for_pos = [] segment_id_for_pos = []
assert self.joiner is not None and self.segments is not None
for s in self.segments: for s in self.segments:
seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))] seg_ids = [s.segment_id_for_pos(i) for i in range(len(s.text))]
segment_id_for_pos.extend(seg_ids) segment_id_for_pos.extend(seg_ids)
@ -180,7 +235,7 @@ class ExtractedText:
return self._segment_id_for_pos[pos] return self._segment_id_for_pos[pos]
@classmethod @classmethod
def from_text_segment(cls, text_segment, nsmap, textequiv_level="region"): def from_text_segment(cls, text_segment, nsmap, *, textequiv_level="region"):
"""Build an ExtractedText from a PAGE content text element""" """Build an ExtractedText from a PAGE content text element"""
localname_for_textequiv_level = {"region": "TextRegion", "line": "TextLine"} localname_for_textequiv_level = {"region": "TextRegion", "line": "TextLine"}
@ -197,7 +252,8 @@ class ExtractedText:
# FIXME hardcoded SBB normalization # FIXME hardcoded SBB normalization
segment_text = normalize_sbb(segment_text) segment_text = normalize_sbb(segment_text)
segment_text = segment_text or "" segment_text = segment_text or ""
return cls(segment_id, None, None, segment_text) clusters = list(grapheme_clusters(segment_text))
return cls(segment_id, None, None, segment_text, clusters)
else: else:
# Recurse # Recurse
sub_localname = children_for_localname[localname] sub_localname = children_for_localname[localname]
@ -212,12 +268,15 @@ class ExtractedText:
) )
) )
joiner = joiner_for_textequiv_level[sub_textequiv_level] joiner = joiner_for_textequiv_level[sub_textequiv_level]
return cls(segment_id, segments, joiner, None) return cls(segment_id, segments, joiner, None, None)
@classmethod @classmethod
def from_str(cls, text, normalization=Normalization.NFC_SBB): def from_str(cls, text, normalization=Normalization.NFC_SBB):
normalized_text = normalize(text, normalization) normalized_text = normalize(text, normalization)
return cls(None, None, None, normalized_text, normalization=normalization) clusters = list(grapheme_clusters(normalized_text))
return cls(
None, None, None, normalized_text, clusters, normalization=normalization
)
def invert_dict(d): def invert_dict(d):
@ -225,7 +284,7 @@ def invert_dict(d):
return {v: k for k, v in d.items()} return {v: k for k, v in d.items()}
def get_textequiv_unicode(text_segment, nsmap) -> str: def get_textequiv_unicode(text_segment: Any, nsmap: Dict[str, str]) -> str:
"""Get the TextEquiv/Unicode text of the given PAGE text element.""" """Get the TextEquiv/Unicode text of the given PAGE text element."""
segment_id = text_segment.attrib["id"] segment_id = text_segment.attrib["id"]
textequivs = text_segment.findall("./page:TextEquiv", namespaces=nsmap) textequivs = text_segment.findall("./page:TextEquiv", namespaces=nsmap)
@ -249,7 +308,7 @@ def get_first_textequiv(textequivs, segment_id):
if np.any(~nan_mask): if np.any(~nan_mask):
if np.any(nan_mask): if np.any(nan_mask):
log.warning("TextEquiv without index in %s.", segment_id) log.warning("TextEquiv without index in %s.", segment_id)
index = np.nanargmin(indices) index = int(np.nanargmin(indices))
else: else:
# try ordering by conf # try ordering by conf
confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float) confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
@ -258,7 +317,7 @@ def get_first_textequiv(textequivs, segment_id):
"No index attributes, use 'conf' attribute to sort TextEquiv in %s.", "No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
segment_id, segment_id,
) )
index = np.nanargmax(confidences) index = int(np.nanargmax(confidences))
else: else:
# fallback to first entry in case of neither index or conf present # fallback to first entry in case of neither index or conf present
log.warning("No index attributes, use first TextEquiv in %s.", segment_id) log.warning("No index attributes, use first TextEquiv in %s.", segment_id)
@ -266,11 +325,11 @@ def get_first_textequiv(textequivs, segment_id):
return textequivs[index] return textequivs[index]
def get_attr(te, attr_name) -> float: def get_attr(te: Any, attr_name: str) -> float:
"""Extract the attribute for the given name. """Extract the attribute for the given name.
Note: currently only handles numeric values! Note: currently only handles numeric values!
Other or non existend values are encoded as np.nan. Other or non existent values are encoded as np.nan.
""" """
attr_value = te.attrib.get(attr_name) attr_value = te.attrib.get(attr_name)
try: try:
@ -22,7 +22,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"dinglehopper uses to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz." "dinglehopper used to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz."
] ]
}, },
{ {
@ -391,7 +391,7 @@
"\\text{CER} = \\frac{i + s + d}{n}\n", "\\text{CER} = \\frac{i + s + d}{n}\n",
"$$\n", "$$\n",
"\n", "\n",
"where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropiate as they *are* clear about this when computing the word error rate.)" "where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropriate as they *are* clear about this when computing the word error rate.)"
] ]
}, },
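A worked example for the CER formula in the cell above (values chosen for illustration):

# GT  = "Augen"  ->  n = 5 characters in the reference
# OCR = "Angen"  ->  one substitution (u -> n), no insertions or deletions
i, s, d, n = 0, 1, 0, 5
cer = (i + s + d) / n
assert cer == 0.2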
{ {
@ -680,7 +680,7 @@
" return cat in unwanted_categories or subcat in unwanted_subcategories\n", " return cat in unwanted_categories or subcat in unwanted_subcategories\n",
"\n", "\n",
" # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n", " # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n",
" # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctation \"or similar characters.\"\n", " # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctuation \"or similar characters.\"\n",
" for word in uniseg.wordbreak.words(s):\n", " for word in uniseg.wordbreak.words(s):\n",
" if all(unwanted(c) for c in word):\n", " if all(unwanted(c) for c in word):\n",
" pass\n", " pass\n",
@ -1,44 +1,56 @@
from __future__ import division, print_function
import os import os
import sys import sys
from typing import Iterator from typing import Dict, Iterator, Optional
import chardet import chardet
from lxml import etree as ET from lxml import etree as ET
from lxml.etree import XMLSyntaxError from lxml.etree import XMLSyntaxError
from ocrd_utils import getLogger
from uniseg.graphemecluster import grapheme_clusters
from .extracted_text import ExtractedText, normalize_sbb from .extracted_text import ExtractedText, normalize_sbb
log = getLogger("processor.OcrdDinglehopperEvaluate")
def alto_namespace(tree: ET.ElementTree) -> str:
def alto_namespace(tree: ET._ElementTree) -> Optional[str]:
"""Return the ALTO namespace used in the given ElementTree. """Return the ALTO namespace used in the given ElementTree.
This relies on the assumption that, in any given ALTO file, the root element has the This relies on the assumption that, in any given ALTO file, the root element has the
local name "alto". We do not check if the files uses any valid ALTO namespace. local name "alto". We do not check if the file uses any valid ALTO namespace.
""" """
root_name = ET.QName(tree.getroot().tag) root_name = ET.QName(tree.getroot().tag)
if root_name.localname == "alto": if root_name.localname == "alto":
assert isinstance(root_name.namespace, str)
return root_name.namespace return root_name.namespace
else: else:
raise ValueError("Not an ALTO tree") raise ValueError("Not an ALTO tree")
def alto_extract_lines(tree: ET.ElementTree) -> Iterator[ExtractedText]: def alto_nsmap(tree: ET._ElementTree) -> Dict[str, str]:
nsmap = {"alto": alto_namespace(tree)} alto_ns = alto_namespace(tree)
if alto_ns is None:
raise ValueError("Could not determine ALTO namespace")
return {"alto": alto_ns}
def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]:
nsmap = alto_nsmap(tree)
for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap): for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap):
line_id = line.attrib.get("ID") line_id = line.attrib.get("ID")
line_text = " ".join( line_text = " ".join(
string.attrib.get("CONTENT") string.attrib.get("CONTENT", "")
for string in line.iterfind("alto:String", namespaces=nsmap) for string in line.iterfind("alto:String", namespaces=nsmap)
) )
yield ExtractedText(line_id, None, None, normalize_sbb(line_text)) normalized_text = normalize_sbb(line_text)
clusters = list(grapheme_clusters(normalized_text))
yield ExtractedText(line_id, None, None, normalized_text, clusters)
# FIXME hardcoded SBB normalization # FIXME hardcoded SBB normalization
def alto_extract(tree: ET.ElementTree) -> ExtractedText: def alto_extract(tree: ET._ElementTree) -> ExtractedText:
"""Extract text from the given ALTO ElementTree.""" """Extract text from the given ALTO ElementTree."""
return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None) return ExtractedText(None, list(alto_extract_lines(tree)), "\n", None, None)
def alto_text(tree): def alto_text(tree):
@ -87,7 +99,7 @@ def page_extract(tree, *, textequiv_level="region"):
# Filter empty region texts # Filter empty region texts
regions = [r for r in regions if r.text != ""] regions = [r for r in regions if r.text != ""]
return ExtractedText(None, regions, "\n", None) return ExtractedText(None, regions, "\n", None, None)
def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level): def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
@ -97,7 +109,7 @@ def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
if ET.QName(group.tag).localname in ["OrderedGroup", "OrderedGroupIndexed"]: if ET.QName(group.tag).localname in ["OrderedGroup", "OrderedGroupIndexed"]:
ro_children = list(group) ro_children = list(group)
ro_children = filter(lambda child: "index" in child.attrib.keys(), ro_children) ro_children = [child for child in ro_children if "index" in child.attrib.keys()]
ro_children = sorted(ro_children, key=lambda child: int(child.attrib["index"])) ro_children = sorted(ro_children, key=lambda child: int(child.attrib["index"]))
elif ET.QName(group.tag).localname in ["UnorderedGroup", "UnorderedGroupIndexed"]: elif ET.QName(group.tag).localname in ["UnorderedGroup", "UnorderedGroupIndexed"]:
ro_children = list(group) ro_children = list(group)
@ -140,33 +152,44 @@ def detect_encoding(filename):
return chardet.detect(open(filename, "rb").read(1024))["encoding"] return chardet.detect(open(filename, "rb").read(1024))["encoding"]
def plain_extract(filename, include_filename_in_id=False): def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"):
id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}" id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}"
fileencoding = detect_encoding(filename) def make_segment(no, line):
normalized_text = normalize_sbb(line)
clusters = list(grapheme_clusters(normalized_text))
return ExtractedText(
id_template.format(filename=os.path.basename(filename), no=no),
None,
None,
normalized_text,
clusters,
)
if encoding == "autodetect":
fileencoding = detect_encoding(filename)
log.warning(
f"Autodetected encoding as '{fileencoding}'"
", it is recommended to specify it explicitly with --plain-encoding"
)
else:
fileencoding = encoding
with open(filename, "r", encoding=fileencoding) as f: with open(filename, "r", encoding=fileencoding) as f:
return ExtractedText( return ExtractedText(
None, None,
[ [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())],
ExtractedText(
id_template.format(filename=os.path.basename(filename), no=no),
None,
None,
normalize_sbb(line),
)
for no, line in enumerate(f.readlines())
],
"\n", "\n",
None, None,
None,
) )
# XXX hardcoded SBB normalization # XXX hardcoded SBB normalization
def plain_text(filename): def plain_text(filename, encoding="autodetect"):
return plain_extract(filename).text return plain_extract(filename, encoding=encoding).text
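Conceptually, plain_extract() turns every line of the file into one segment and joins them with "\n"; the final newline disappears because each line is stripped. A rough stand-in with plain strings instead of ExtractedText (mirrors the expectation in the plain-text tests further down):

with open("ocr.txt", "w", encoding="utf-8") as f:
    f.write("First, a line.\nAnd a second line.\n")

with open("ocr.txt", "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f.readlines()]

assert "\n".join(lines) == "First, a line.\nAnd a second line."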
def extract(filename, *, textequiv_level="region"): def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"):
"""Extract the text from the given file. """Extract the text from the given file.
Supports PAGE, ALTO and falls back to plain text. Supports PAGE, ALTO and falls back to plain text.
@ -174,7 +197,7 @@ def extract(filename, *, textequiv_level="region"):
try: try:
tree = ET.parse(filename) tree = ET.parse(filename)
except (XMLSyntaxError, UnicodeDecodeError): except (XMLSyntaxError, UnicodeDecodeError):
return plain_extract(filename) return plain_extract(filename, encoding=plain_encoding)
try: try:
return page_extract(tree, textequiv_level=textequiv_level) return page_extract(tree, textequiv_level=textequiv_level)
except ValueError: except ValueError:
@ -1,17 +1,13 @@
{ {
"version": "0.9.1", "version": "0.11.0",
"git_url": "https://github.com/qurator-spk/dinglehopper", "git_url": "https://github.com/qurator-spk/dinglehopper",
"dockerhub": "ocrd/dinglehopper",
"tools": { "tools": {
"ocrd-dinglehopper": { "ocrd-dinglehopper": {
"executable": "ocrd-dinglehopper", "executable": "ocrd-dinglehopper",
"input_file_grp_cardinality": 2,
"output_file_grp_cardinality": 1,
"description": "Evaluate OCR text against ground truth with dinglehopper", "description": "Evaluate OCR text against ground truth with dinglehopper",
"input_file_grp": [
"OCR-D-GT-PAGE",
"OCR-D-OCR"
],
"output_file_grp": [
"OCR-D-OCR-EVAL"
],
"categories": [ "categories": [
"Quality assurance" "Quality assurance"
], ],
@ -29,6 +25,11 @@
"enum": ["region", "line"], "enum": ["region", "line"],
"default": "region", "default": "region",
"description": "PAGE XML hierarchy level to extract the text from" "description": "PAGE XML hierarchy level to extract the text from"
},
"plain_encoding": {
"type": "string",
"default": "autodetect",
"description": "Encoding (e.g. \"utf-8\") of plain text files"
} }
} }
} }
@ -1,78 +1,78 @@
import json from functools import cached_property
import os import os
from typing import Optional
import click import click
from ocrd_models import OcrdFileType
from ocrd import Processor from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id from ocrd_utils import make_file_id
from pkg_resources import resource_string
from .cli import process as cli_process from .cli import process as cli_process
OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
@click.command() @click.command()
@ocrd_cli_options @ocrd_cli_options
def ocrd_dinglehopper(*args, **kwargs): def ocrd_dinglehopper(*args, **kwargs):
return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
class OcrdDinglehopperEvaluate(Processor): class OcrdDinglehopperEvaluate(Processor):
def __init__(self, *args, **kwargs):
kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
def process(self): @cached_property
assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR") def executable(self):
assert_file_grp_cardinality(self.output_file_grp, 1) return 'ocrd-dinglehopper'
log = getLogger("processor.OcrdDinglehopperEvaluate") def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
assert self.parameter
metrics = self.parameter["metrics"] metrics = self.parameter["metrics"]
textequiv_level = self.parameter["textequiv_level"] textequiv_level = self.parameter["textequiv_level"]
gt_grp, ocr_grp = self.input_file_grp.split(",") plain_encoding = self.parameter["plain_encoding"]
input_file_tuples = self.zip_input_files(on_error="abort") # wrong number of inputs: let fail
for n, (gt_file, ocr_file) in enumerate(input_file_tuples): gt_file, ocr_file = input_files
if not gt_file or not ocr_file: # missing on either side: skip (zip_input_files already warned)
# file/page was not found in this group if not gt_file or not ocr_file:
continue return
gt_file = self.workspace.download_file(gt_file) # missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
ocr_file = self.workspace.download_file(ocr_file) if not gt_file.local_filename:
page_id = gt_file.pageId if config.OCRD_MISSING_INPUT == 'ABORT':
raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype)
return
if not ocr_file.local_filename:
if config.OCRD_MISSING_INPUT == 'ABORT':
raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype)
return
log.info("INPUT FILES %i / %s%s", n, gt_file, ocr_file) page_id = gt_file.pageId
file_id = make_file_id(ocr_file, self.output_file_grp) file_id = make_file_id(ocr_file, self.output_file_grp)
report_prefix = os.path.join(self.output_file_grp, file_id) cli_process(
gt_file.local_filename,
ocr_file.local_filename,
file_id,
self.output_file_grp,
metrics=metrics,
textequiv_level=textequiv_level,
plain_encoding=plain_encoding,
)
# Process the files # Add reports to the workspace
try: for report_suffix, mimetype in [
os.mkdir(self.output_file_grp) [".html", "text/html"],
except FileExistsError: [".json", "application/json"],
pass ]:
cli_process( output_file_id = file_id + report_suffix
gt_file.local_filename, output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
ocr_file.local_filename, if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
report_prefix, raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set")
metrics=metrics, self.workspace.add_file(
textequiv_level=textequiv_level, file_id=output_file_id,
file_grp=self.output_file_grp,
page_id=page_id,
mimetype=mimetype,
local_filename=file_id + report_suffix,
) )
# Add reports to the workspace
for report_suffix, mimetype in [
[".html", "text/html"],
[".json", "application/json"],
]:
self.workspace.add_file(
file_id=file_id + report_suffix,
file_grp=self.output_file_grp,
page_id=page_id,
mimetype=mimetype,
local_filename=report_prefix + report_suffix,
)
if __name__ == "__main__": if __name__ == "__main__":
ocrd_dinglehopper() ocrd_dinglehopper()
@ -138,17 +138,17 @@
<mets:fileSec> <mets:fileSec>
<mets:fileGrp USE="OCR-D-GT-PAGE"> <mets:fileGrp USE="OCR-D-GT-PAGE">
<mets:file MIMETYPE="application/xml" ID="OCR-D-GT-PAGE_00000024"> <mets:file MIMETYPE="application/xml" ID="OCR-D-GT-PAGE_00000024">
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-GT-PAGE/00000024.page.xml"/> <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-GT-PAGE/00000024.page.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file> </mets:file>
</mets:fileGrp> </mets:fileGrp>
<mets:fileGrp USE="OCR-D-OCR-CALAMARI"> <mets:fileGrp USE="OCR-D-OCR-CALAMARI">
<mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-CALAMARI_0001"> <mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-CALAMARI_0001">
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml"/> <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file> </mets:file>
</mets:fileGrp> </mets:fileGrp>
<mets:fileGrp USE="OCR-D-OCR-TESS"> <mets:fileGrp USE="OCR-D-OCR-TESS">
<mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-TESS_0001"> <mets:file MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-OCR-TESS_0001">
<mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml"/> <mets:FLocat xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file> </mets:file>
</mets:fileGrp> </mets:fileGrp>
</mets:fileSec> </mets:fileSec>
@ -0,0 +1 @@
This is a test.
@ -0,0 +1 @@
Another test.
@ -0,0 +1 @@
Tis is a test.
@ -0,0 +1 @@
AnÖther test.
@ -0,0 +1 @@
This is a test.
@ -0,0 +1 @@
Tis is a test.
@ -0,0 +1 @@
Another test.
@ -0,0 +1 @@
AnÖther test.
@ -0,0 +1 @@
This is a test.
@ -0,0 +1 @@
Another test.
@ -0,0 +1 @@
Tis is a test.
@ -0,0 +1 @@
AnÖther test.
@ -13,12 +13,13 @@ def test_text():
test1 = ExtractedText( test1 = ExtractedText(
None, None,
[ [
ExtractedText("s0", None, None, "foo"), ExtractedText("s0", None, None, "foo", grapheme_clusters("foo")),
ExtractedText("s1", None, None, "bar"), ExtractedText("s1", None, None, "bar", grapheme_clusters("bar")),
ExtractedText("s2", None, None, "bazinga"), ExtractedText("s2", None, None, "bazinga", grapheme_clusters("bazinga")),
], ],
" ", " ",
None, None,
None,
) )
assert test1.text == "foo bar bazinga" assert test1.text == "foo bar bazinga"
@ -29,8 +30,20 @@ def test_text():
def test_normalization_check(): def test_normalization_check():
with pytest.raises(ValueError, match=r".*is not in NFC.*"): with pytest.raises(ValueError, match=r".*is not in NFC.*"):
ExtractedText("foo", None, None, unicodedata.normalize("NFD", "Schlyñ")) ExtractedText(
assert ExtractedText("foo", None, None, unicodedata.normalize("NFC", "Schlyñ")) "foo",
None,
None,
unicodedata.normalize("NFD", "Schlyñ"),
grapheme_clusters(unicodedata.normalize("NFD", "Schlyñ")),
)
assert ExtractedText(
"foo",
None,
None,
unicodedata.normalize("NFC", "Schlyñ"),
grapheme_clusters(unicodedata.normalize("NFC", "Schlyñ")),
)
AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id") AlignmentElement = namedtuple("AlignmentElement", "left right left_id right_id")
@ -47,25 +60,27 @@ def test_align():
test1 = ExtractedText( test1 = ExtractedText(
None, None,
[ [
ExtractedText("s0", None, None, "foo"), ExtractedText("s0", None, None, "foo", grapheme_clusters("foo")),
ExtractedText("s1", None, None, "bar"), ExtractedText("s1", None, None, "bar", grapheme_clusters("bar")),
ExtractedText("s2", None, None, "batzinga"), ExtractedText("s2", None, None, "batzinga", grapheme_clusters("batzinga")),
], ],
" ", " ",
None, None,
None,
) )
test2 = ExtractedText( test2 = ExtractedText(
None, None,
[ [
ExtractedText("x0", None, None, "foo"), ExtractedText("x0", None, None, "foo", grapheme_clusters("foo")),
ExtractedText("x1", None, None, "bar"), ExtractedText("x1", None, None, "bar", grapheme_clusters("bar")),
# extra . # extra .
ExtractedText("x2", None, None, "."), ExtractedText("x2", None, None, ".", grapheme_clusters(".")),
# deletion + different grapheme cluster, m̃ also is two Python characters # deletion + different grapheme cluster, m̃ also is two Python characters
ExtractedText("x3", None, None, "bazim̃ga"), ExtractedText("x3", None, None, "bazim̃ga", grapheme_clusters("bazim̃ga")),
], ],
" ", " ",
None, None,
None,
) )
left_pos = 0 left_pos = 0
@ -1,6 +1,8 @@
import math
import pytest import pytest
from .. import align, distance, seq_align from .. import align, distance, score_hint, seq_align
from .util import unzip from .util import unzip
@ -183,3 +185,8 @@ def test_lines_similar():
# Test __eq__ (i.e. is it a substitution or a similar string?) # Test __eq__ (i.e. is it a substitution or a similar string?)
assert list(left)[0] == list(right)[0] assert list(left)[0] == list(right)[0]
def test_score_hint():
assert score_hint(0.5, 23) == 12 # int(ceil())
assert score_hint(math.inf, 12345) is None
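For orientation, a sketch of what score_hint() appears to compute, consistent with this test (an illustration, not necessarily the actual implementation):

import math

def score_hint_sketch(error_rate, n):
    if math.isinf(error_rate):
        return None                      # no usable hint for infinite error rates
    return int(math.ceil(error_rate * n))

assert score_hint_sketch(0.5, 23) == 12
assert score_hint_sketch(math.inf, 12345) is None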
@ -21,9 +21,9 @@ def test_cli_directory(tmp_path):
os.path.join(data_dir, "directory-test", "ocr"), os.path.join(data_dir, "directory-test", "ocr"),
"report", "report",
str(tmp_path / "reports"), str(tmp_path / "reports"),
False, metrics=False,
True, differences=True,
"line", textequiv_level="line",
) )
assert os.path.exists(tmp_path / "reports/1.xml-report.json") assert os.path.exists(tmp_path / "reports/1.xml-report.json")
@ -45,9 +45,9 @@ def test_cli_fail_without_gt(tmp_path):
os.path.join(data_dir, "directory-test", "ocr"), os.path.join(data_dir, "directory-test", "ocr"),
"report", "report",
str(tmp_path / "reports"), str(tmp_path / "reports"),
False, metrics=False,
True, differences=True,
"line", textequiv_level="line",
) )
assert len(os.listdir(tmp_path / "reports")) == 2 * 2 assert len(os.listdir(tmp_path / "reports")) == 2 * 2
@ -0,0 +1,61 @@
import json
import os.path
import re
import pytest
from ..cli_line_dirs import process
from .util import working_directory
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
@pytest.mark.integration
def test_cli_line_dirs_basic(tmp_path):
"""Test that the cli/process() produces a good report"""
with working_directory(tmp_path):
gt_dir = os.path.join(data_dir, "line_dirs/basic/gt")
ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr")
process(gt_dir, ocr_dir, "report")
with open("report.json", "r") as jsonf:
print(jsonf.read())
with open("report.json", "r") as jsonf:
j = json.load(jsonf)
assert j["cer"] == pytest.approx(0.1071429)
assert j["wer"] == pytest.approx(0.5)
@pytest.mark.integration
def test_cli_line_dirs_basic_report_diff(tmp_path):
"""Test that the cli/process() produces a report with char+word diff"""
with working_directory(tmp_path):
gt_dir = os.path.join(data_dir, "line_dirs/basic/gt")
ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr")
process(gt_dir, ocr_dir, "report")
with open("report.html", "r") as htmlf:
html_report = htmlf.read()
# Counting GT lines in the diff
assert len(re.findall(r"gt.*l\d+-cdiff", html_report)) == 2
assert len(re.findall(r"gt.*l\d+-wdiff", html_report)) == 2
@pytest.mark.integration
def test_cli_line_dirs_merged(tmp_path):
"""Test that the cli/process() produces a good report"""
with working_directory(tmp_path):
gt_dir = os.path.join(data_dir, "line_dirs/merged")
ocr_dir = os.path.join(data_dir, "line_dirs/merged")
process(
gt_dir, ocr_dir, "report", gt_suffix=".gt.txt", ocr_suffix=".some-ocr.txt"
)
with open("report.json", "r") as jsonf:
print(jsonf.read())
with open("report.json", "r") as jsonf:
j = json.load(jsonf)
assert j["cer"] == pytest.approx(0.1071429)
assert j["wer"] == pytest.approx(0.5)
@ -1,4 +1,5 @@
import json import json
import re
import pytest import pytest
@ -40,3 +41,25 @@ def test_cli_json_cer_is_infinity(tmp_path):
with open("report.json", "r") as jsonf: with open("report.json", "r") as jsonf:
j = json.load(jsonf) j = json.load(jsonf)
assert j["cer"] == pytest.approx(float("inf")) assert j["cer"] == pytest.approx(float("inf"))
@pytest.mark.integration
def test_cli_html(tmp_path):
"""Test that the cli/process() yields complete HTML report"""
with working_directory(tmp_path):
with open("gt.txt", "w") as gtf:
gtf.write("AAAAA")
with open("ocr.txt", "w") as ocrf:
ocrf.write("AAAAB")
process("gt.txt", "ocr.txt", "report")
with open("report.html", "r") as htmlf:
html_report = htmlf.read()
print(html_report)
assert re.search(r"CER: 0\.\d+", html_report)
assert re.search(r"WER: 1\.0", html_report)
assert len(re.findall("gt.*cdiff", html_report)) == 1
assert len(re.findall("gt.*wdiff", html_report)) == 1
@ -0,0 +1,35 @@
from __future__ import division, print_function
import math
import pytest
from .. import character_error_rate, plain_text
from .util import working_directory
@pytest.mark.integration
@pytest.mark.parametrize(
"gt_file_content,ocr_file_content,cer_expected",
[
("", "Lorem ipsum", math.inf),
("Lorem ipsum", "", 1.0),
("\ufeff", "Lorem ipsum", math.inf),
("Lorem ipsum", "\ufeff", 1.0),
("", "", 0.0),
("\ufeff", "", 0.0),
("", "\ufeff", 0.0),
],
)
def test_empty_files(tmp_path, gt_file_content, ocr_file_content, cer_expected):
with working_directory(tmp_path):
with open("gt.txt", "w") as gtf:
gtf.write(gt_file_content)
with open("ocr.txt", "w") as ocrf:
ocrf.write(ocr_file_content)
gt_text = plain_text("gt.txt")
ocr_text = plain_text("ocr.txt")
assert character_error_rate(gt_text, ocr_text) == cer_expected
@ -34,9 +34,8 @@ def test_ocrd_cli(tmp_path):
"-O", "-O",
"OCR-D-OCR-CALAMARI-EVAL", "OCR-D-OCR-CALAMARI-EVAL",
] ]
sys.argv[ # Hack to satisfy ocrd_cli_wrap_processor() check for arguments
1: sys.argv[1:] = args
] = args # XXX Hack to satisfy ocrd_cli_wrap_processor() check for arguments
result = runner.invoke(ocrd_dinglehopper, args) result = runner.invoke(ocrd_dinglehopper, args)
assert result.exit_code == 0 assert result.exit_code == 0
result_json = list((test_workspace_dir / "OCR-D-OCR-CALAMARI-EVAL").glob("*.json")) result_json = list((test_workspace_dir / "OCR-D-OCR-CALAMARI-EVAL").glob("*.json"))
@ -0,0 +1,71 @@
import os
from ..cli_line_dirs import find_gt_and_ocr_files, find_gt_and_ocr_files_autodetect
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
def test_basic():
"""Test the dumb method: User gives directories and suffixes."""
pairs = list(
find_gt_and_ocr_files(
os.path.join(data_dir, "line_dirs/basic/gt"),
".gt.txt",
os.path.join(data_dir, "line_dirs/basic/ocr"),
".some-ocr.txt",
)
)
assert len(pairs) == 2
def test_basic_autodetect():
"""Test autodetect: User gives directories, suffixes are autodetected if possible"""
pairs = list(
find_gt_and_ocr_files_autodetect(
os.path.join(data_dir, "line_dirs/basic/gt"),
os.path.join(data_dir, "line_dirs/basic/ocr"),
)
)
assert len(pairs) == 2
def test_subdirs():
"""Test the dumb method: Should also work when subdirectories are involved."""
pairs = list(
find_gt_and_ocr_files(
os.path.join(data_dir, "line_dirs/subdirs/gt"),
".gt.txt",
os.path.join(data_dir, "line_dirs/subdirs/ocr"),
".some-ocr.txt",
)
)
assert len(pairs) == 2
def test_subdirs_autodetect():
"""Test the autodetect method: Should also work when subdirectories are involved."""
pairs = list(
find_gt_and_ocr_files_autodetect(
os.path.join(data_dir, "line_dirs/subdirs/gt"),
os.path.join(data_dir, "line_dirs/subdirs/ocr"),
)
)
assert len(pairs) == 2
def test_merged():
"""Test the dumb method: GT and OCR texts are in the same directories."""
pairs = list(
find_gt_and_ocr_files(
os.path.join(data_dir, "line_dirs/merged"),
".gt.txt",
os.path.join(data_dir, "line_dirs/merged"),
".some-ocr.txt",
)
)
assert len(pairs) == 2
@ -177,8 +177,20 @@ def test_text():
def test_plain(tmp_path): def test_plain(tmp_path):
with working_directory(tmp_path): with working_directory(tmp_path):
with open("ocr.txt", "w") as ocrf: with open("ocr.txt", "w") as ocrf:
ocrf.write("AAAAB") ocrf.write("First, a line.\nAnd a second line.\n")
result = plain_text("ocr.txt") result = plain_text("ocr.txt")
expected = "AAAAB" expected = "First, a line.\nAnd a second line."
assert result == expected
def test_plain_BOM(tmp_path):
"""Test that plain text files with BOM are read correctly."""
BOM = "\ufeff"
with working_directory(tmp_path):
with open("ocr.txt", "w") as ocrf:
ocrf.write(BOM + "First, a line.\nAnd a second line.\n")
result = plain_text("ocr.txt")
expected = "First, a line.\nAnd a second line."
assert result == expected assert result == expected
@ -1,7 +1,5 @@
from __future__ import division
import unicodedata import unicodedata
from typing import Iterable, Tuple from typing import Generator, Iterable, Tuple, TypeVar
import uniseg.wordbreak import uniseg.wordbreak
from multimethod import multimethod from multimethod import multimethod
@ -9,6 +7,8 @@ from rapidfuzz.distance import Levenshtein
from .extracted_text import ExtractedText from .extracted_text import ExtractedText
T = TypeVar("T")
# Did we patch uniseg.wordbreak.word_break already? # Did we patch uniseg.wordbreak.word_break already?
word_break_patched = False word_break_patched = False
@ -21,12 +21,17 @@ def patch_word_break():
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
""" """
old_word_break = uniseg.wordbreak.word_break old_word_break = uniseg.wordbreak.word_break
if hasattr(uniseg.wordbreak, 'Word_Break'):
aletter = uniseg.wordbreak.Word_Break.ALetter
else:
# uniseg<0.9
aletter = uniseg.wordbreak.WordBreak.ALETTER
def new_word_break(c, index=0): def new_word_break(c):
if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area
return "ALetter" return aletter
else: else:
return old_word_break(c, index) return old_word_break(c)
uniseg.wordbreak.word_break = new_word_break uniseg.wordbreak.word_break = new_word_break
global word_break_patched global word_break_patched
@ -34,7 +39,7 @@ def patch_word_break():
@multimethod @multimethod
def words(s: str): def words(s: str) -> Generator[str, None, None]:
"""Extract words from a string""" """Extract words from a string"""
global word_break_patched global word_break_patched
@ -54,7 +59,7 @@ def words(s: str):
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on
# word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain # word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain
# only whitespace, punctation "or similar characters." # only whitespace, punctuation "or similar characters."
for word in uniseg.wordbreak.words(s): for word in uniseg.wordbreak.words(s):
if all(unwanted(c) for c in word): if all(unwanted(c) for c in word):
pass pass
@ -62,37 +67,37 @@ def words(s: str):
yield word yield word
@multimethod @words.register
def words(s: ExtractedText): def _(s: ExtractedText) -> Generator[str, None, None]:
return words(s.text) yield from words(s.text)
@multimethod @multimethod
def words_normalized(s: str): def words_normalized(s: str) -> Generator[str, None, None]:
return words(unicodedata.normalize("NFC", s)) yield from words(unicodedata.normalize("NFC", s))
@multimethod @words_normalized.register
def words_normalized(s: ExtractedText): def _(s: ExtractedText) -> Generator[str, None, None]:
return words_normalized(s.text) yield from words_normalized(s.text)
@multimethod @multimethod
def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]: def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
reference_seq = list(words_normalized(reference)) reference_seq = list(words_normalized(reference))
compared_seq = list(words_normalized(compared)) compared_seq = list(words_normalized(compared))
return word_error_rate_n(reference_seq, compared_seq) wer, n = word_error_rate_n(reference_seq, compared_seq)
return wer, n
@multimethod @word_error_rate_n.register
def word_error_rate_n( def _(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
reference: ExtractedText, compared: ExtractedText wer, n = word_error_rate_n(reference.text, compared.text)
) -> Tuple[float, int]: return wer, n
return word_error_rate_n(reference.text, compared.text)
@multimethod @word_error_rate_n.register
def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]: def _(reference: Iterable[T], compared: Iterable[T]) -> Tuple[float, int]:
reference_seq = list(reference) reference_seq = list(reference)
compared_seq = list(compared) compared_seq = list(compared)
@ -106,6 +111,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, i
return d / n, n return d / n, n
def word_error_rate(reference, compared) -> float: def word_error_rate(reference: T, compared: T) -> float:
wer: float
wer, _ = word_error_rate_n(reference, compared) wer, _ = word_error_rate_n(reference, compared)
return wer return wer
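A condensed illustration of the word-level pipeline that word_error_rate_n() builds on: normalize, segment into words, then Levenshtein over the word sequences (str.split() stands in for the uniseg-based words() above):

import unicodedata
from rapidfuzz.distance import Levenshtein

gt_words = unicodedata.normalize("NFC", "Dies ist ein Test").split()
ocr_words = unicodedata.normalize("NFC", "Dies ist ein Tesl").split()

d = Levenshtein.distance(gt_words, ocr_words)   # one substituted word
wer = d / len(gt_words)                         # n = number of reference words
assert wer == 0.25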