use uniseg again

2026-07-26 05:29:35 +02:00 · 2022-10-12 18:52:58 +02:00 · 2022-10-12 18:52:58 +02:00 · f48e305347
commit f48e305347
parent d2bbc8a6c7
6 changed files with 12 additions and 12 deletions
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -2,7 +2,7 @@ import unicodedata
 from typing import Tuple

 from multimethod import multimethod
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters

 from .edit_distance import distance
 from .extracted_text import ExtractedText
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -1,7 +1,7 @@
 import unicodedata

 from multimethod import multimethod
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 from rapidfuzz.distance import Levenshtein

 from .extracted_text import ExtractedText
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -9,7 +9,7 @@ import attr
 import numpy as np
 from lxml import etree as ET
 from ocrd_utils import getLogger
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters


 class Normalization(enum.Enum):
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -4,7 +4,7 @@ from typing import Iterator

 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters

 from .extracted_text import ExtractedText, normalize_sbb

--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -2,24 +2,24 @@ import unicodedata
 from typing import Tuple, Iterable
 from multimethod import multimethod

-import uniseg2.wordbreak
+import uniseg.wordbreak

 from rapidfuzz.distance import Levenshtein
 from . import ExtractedText


-# Did we patch uniseg2.wordbreak.word_break already?
+# Did we patch uniseg.wordbreak.word_break already?
 word_break_patched = False


 def patch_word_break():
    """
-    Patch uniseg2.wordbreak.word_break to deal with our private use characters.
+    Patch uniseg.wordbreak.word_break to deal with our private use characters.

    See also
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
    """
-    old_word_break = uniseg2.wordbreak.word_break
+    old_word_break = uniseg.wordbreak.word_break

    def new_word_break(c, index=0):
        if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
@@ -27,7 +27,7 @@ def patch_word_break():
        else:
            return old_word_break(c, index)

-    uniseg2.wordbreak.word_break = new_word_break
+    uniseg.wordbreak.word_break = new_word_break
    global word_break_patched
    word_break_patched = True

@@ -53,8 +53,8 @@ def words(s: str):
        return cat in unwanted_categories or subcat in unwanted_subcategories

    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
-    # uniseg2.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
-    for word in uniseg2.wordbreak.words(s):
+    # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
+    for word in uniseg.wordbreak.words(s):
        if all(unwanted(c) for c in word):
            pass
        else:
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 click
 jinja2
 lxml
-uniseg2
+uniseg
 numpy
 colorama
 MarkupSafe