replace uniseg with uniseg2

2026-01-07 19:07:10 +01:00 · 2022-08-29 22:08:25 +02:00 · 2022-08-29 22:08:25 +02:00 · a1f0a5e2d3
commit a1f0a5e2d3
parent 22c3817f45
7 changed files with 30 additions and 12 deletions
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -2,7 +2,7 @@ import unicodedata
 from typing import Tuple

 from multimethod import multimethod
-from uniseg.graphemecluster import grapheme_clusters
+from uniseg2.graphemecluster import grapheme_clusters

 from .edit_distance import distance
 from .extracted_text import ExtractedText
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -175,6 +175,24 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
+    import cProfile
+    import pstats
+    import io
+    import atexit
+
+    #print("Profiling...")
+    #pr = cProfile.Profile()
+    #pr.enable()
+
+    def exit():
+        pr.disable()
+        print("Profiling completed")
+        s = io.StringIO()
+        pstats.Stats(pr, stream=s).sort_stats("cumtime").print_stats()
+        print(s.getvalue())
+
+    #atexit.register(exit)
+
    initLogging()
    Config.progress = progress
    process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -1,7 +1,7 @@
 import unicodedata

 from multimethod import multimethod
-from uniseg.graphemecluster import grapheme_clusters
+from uniseg2.graphemecluster import grapheme_clusters
 from rapidfuzz.distance import Levenshtein

 from .extracted_text import ExtractedText
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -9,7 +9,7 @@ import attr
 import numpy as np
 from lxml import etree as ET
 from ocrd_utils import getLogger
-from uniseg.graphemecluster import grapheme_clusters
+from uniseg2.graphemecluster import grapheme_clusters


 class Normalization(enum.Enum):
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -4,7 +4,7 @@ from typing import Iterator

 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
-from uniseg.graphemecluster import grapheme_clusters
+from uniseg2.graphemecluster import grapheme_clusters

 from .extracted_text import ExtractedText, normalize_sbb

--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -2,24 +2,24 @@ import unicodedata
 from typing import Tuple, Iterable
 from multimethod import multimethod

-import uniseg.wordbreak
+import uniseg2.wordbreak

 from rapidfuzz.distance import Levenshtein
 from . import ExtractedText


-# Did we patch uniseg.wordbreak.word_break already?
+# Did we patch uniseg2.wordbreak.word_break already?
 word_break_patched = False


 def patch_word_break():
    """
-    Patch uniseg.wordbreak.word_break to deal with our private use characters.
+    Patch uniseg2.wordbreak.word_break to deal with our private use characters.

    See also
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
    """
-    old_word_break = uniseg.wordbreak.word_break
+    old_word_break = uniseg2.wordbreak.word_break

    def new_word_break(c, index=0):
        if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
@@ -27,7 +27,7 @@ def patch_word_break():
        else:
            return old_word_break(c, index)

-    uniseg.wordbreak.word_break = new_word_break
+    uniseg2.wordbreak.word_break = new_word_break
    global word_break_patched
    word_break_patched = True

@@ -53,8 +53,8 @@ def words(s: str):
        return cat in unwanted_categories or subcat in unwanted_subcategories

    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
-    # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
-    for word in uniseg.wordbreak.words(s):
+    # uniseg2.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
+    for word in uniseg2.wordbreak.words(s):
        if all(unwanted(c) for c in word):
            pass
        else:
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 click
 jinja2
 lxml
-uniseg
+uniseg2
 numpy
 colorama
 MarkupSafe