Merge pull request #139 from bertsky/allow-uniseg-py38

re-allow uniseg 0.8 and py38
2026-03-16 20:22:03 +01:00 · 2025-04-22 10:09:51 +02:00 · 2025-04-22 10:09:51 +02:00 · 3d7c7ee1e3
commit 3d7c7ee1e3
parent f6dfb77f94 a24623b966
4 changed files with 9 additions and 4 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,7 +25,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ]
    runs-on: "ubuntu-latest"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ authors = [
 description = "An OCR evaluation tool"
 readme = "README.md"
 license.file = "LICENSE"
-requires-python = ">=3.9"
+requires-python = ">=3.8"
 keywords = ["qurator", "ocr", "evaluation", "ocr-d"]
 dynamic = ["version", "dependencies", "optional-dependencies"]
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 click
 jinja2
 lxml
-uniseg >= 0.9.1
+uniseg >= 0.8.0
 numpy
 colorama
 MarkupSafe
--- a/src/dinglehopper/word_error_rate.py
+++ b/src/dinglehopper/word_error_rate.py
@@ -21,10 +21,15 @@ def patch_word_break():
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
    """
    old_word_break = uniseg.wordbreak.word_break
+    if hasattr(uniseg.wordbreak, 'Word_Break'):
+        aletter = uniseg.wordbreak.Word_Break.ALetter
+    else:
+        # uniseg<0.9
+        aletter = uniseg.wordbreak.WordBreak.ALETTER
    def new_word_break(c):
        if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
-            return uniseg.wordbreak.Word_Break.ALetter
+            return aletter
        else:
            return old_word_break(c)