From a1f0a5e2d36969e6ffcf67256e9194b058513efe Mon Sep 17 00:00:00 2001
From: Max Bachmann <kontakt@maxbachmann.de>
Date: Mon, 29 Aug 2022 22:08:25 +0200
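Subject: [PATCH] replace uniseg with uniseg2

Switch the grapheme cluster and word break imports, the word_break
monkeypatch in word_error_rate.py, and requirements.txt from uniseg to
uniseg2.

Also adds profiling scaffolding (inactive by default) to the CLI entry
point in cli.py.
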
---
 qurator/dinglehopper/character_error_rate.py |  2 +-
 qurator/dinglehopper/cli.py                  | 18 ++++++++++++++++++
 qurator/dinglehopper/edit_distance.py        |  2 +-
 qurator/dinglehopper/extracted_text.py       |  2 +-
 qurator/dinglehopper/ocr_files.py            |  2 +-
 qurator/dinglehopper/word_error_rate.py      | 14 +++++++-------
 requirements.txt                             |  2 +-
 7 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py
index 3b8c0cc..68accae 100644
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -2,7 +2,7 @@ import unicodedata
 from typing import Tuple
 
 from multimethod import multimethod
-from uniseg.graphemecluster import grapheme_clusters
+from uniseg2.graphemecluster import grapheme_clusters
 
 from .edit_distance import distance
 from .extracted_text import ExtractedText
diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 3c52c5d..7b74b78 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -175,6 +175,24 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
     By default, the text of PAGE files is extracted on 'region' level. You may
     use "--textequiv-level line" to extract from the level of TextLine tags.
     """
+    import atexit
+    import cProfile
+    import os
+    import pstats
+
+    # Opt-in profiling: set DINGLEHOPPER_PROFILE to any non-empty value to
+    # collect cProfile data for this run and print it when the process exits.
+    if os.environ.get("DINGLEHOPPER_PROFILE"):
+        profiler = cProfile.Profile()
+        profiler.enable()
+
+        def report_profile():
+            # Runs via atexit: stop collecting, then print stats by cumulative time.
+            profiler.disable()
+            pstats.Stats(profiler).sort_stats("cumtime").print_stats()
+
+        atexit.register(report_profile)
+
     initLogging()
     Config.progress = progress
     process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index 2120b80..d89eb8c 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -1,7 +1,7 @@
 import unicodedata
 
 from multimethod import multimethod
-from uniseg.graphemecluster import grapheme_clusters
+from uniseg2.graphemecluster import grapheme_clusters
 from rapidfuzz.distance import Levenshtein
 
 from .extracted_text import ExtractedText
diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py
index 19ad9c1..ebb9631 100644
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -9,7 +9,7 @@ import attr
 import numpy as np
 from lxml import etree as ET
 from ocrd_utils import getLogger
-from uniseg.graphemecluster import grapheme_clusters
+from uniseg2.graphemecluster import grapheme_clusters
 
 
 class Normalization(enum.Enum):
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 6384dfa..29101cd 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -4,7 +4,7 @@ from typing import Iterator
 
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
-from uniseg.graphemecluster import grapheme_clusters
+from uniseg2.graphemecluster import grapheme_clusters
 
 from .extracted_text import ExtractedText, normalize_sbb
 
diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py
index 3b9ff5e..ccfc64a 100644
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -2,24 +2,24 @@ import unicodedata
 from typing import Tuple, Iterable
 from multimethod import multimethod
 
-import uniseg.wordbreak
+import uniseg2.wordbreak
 
 from rapidfuzz.distance import Levenshtein
 from . import ExtractedText
 
 
-# Did we patch uniseg.wordbreak.word_break already?
+# Did we patch uniseg2.wordbreak.word_break already?
 word_break_patched = False
 
 
 def patch_word_break():
     """
-    Patch uniseg.wordbreak.word_break to deal with our private use characters.
+    Patch uniseg2.wordbreak.word_break to deal with our private use characters.
 
     See also
     https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
     """
-    old_word_break = uniseg.wordbreak.word_break
+    old_word_break = uniseg2.wordbreak.word_break
 
     def new_word_break(c, index=0):
         if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
@@ -27,7 +27,7 @@ def patch_word_break():
         else:
             return old_word_break(c, index)
 
-    uniseg.wordbreak.word_break = new_word_break
+    uniseg2.wordbreak.word_break = new_word_break
     global word_break_patched
     word_break_patched = True
 
@@ -53,8 +53,8 @@ def words(s: str):
         return cat in unwanted_categories or subcat in unwanted_subcategories
 
     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
-    # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
-    for word in uniseg.wordbreak.words(s):
+    # uniseg2.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
+    for word in uniseg2.wordbreak.words(s):
         if all(unwanted(c) for c in word):
             pass
         else:
diff --git a/requirements.txt b/requirements.txt
index daf2b0f..3c7a257 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 click
 jinja2
 lxml
-uniseg
+uniseg2
 numpy
 colorama
 MarkupSafe