From f48e305347ff9ae9fec3641d8bd4101562dda3a3 Mon Sep 17 00:00:00 2001
From: Max Bachmann <kontakt@maxbachmann.de>
Date: Wed, 12 Oct 2022 18:52:58 +0200
Subject: [PATCH] use uniseg again

---
 qurator/dinglehopper/character_error_rate.py |  2 +-
 qurator/dinglehopper/edit_distance.py        |  2 +-
 qurator/dinglehopper/extracted_text.py       |  2 +-
 qurator/dinglehopper/ocr_files.py            |  2 +-
 qurator/dinglehopper/word_error_rate.py      | 14 +++++++-------
 requirements.txt                             |  2 +-
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py
index 68accae..3b8c0cc 100644
--- a/qurator/dinglehopper/character_error_rate.py
+++ b/qurator/dinglehopper/character_error_rate.py
@@ -2,7 +2,7 @@ import unicodedata
 from typing import Tuple
 
 from multimethod import multimethod
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 
 from .edit_distance import distance
 from .extracted_text import ExtractedText
diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py
index d89eb8c..2120b80 100644
--- a/qurator/dinglehopper/edit_distance.py
+++ b/qurator/dinglehopper/edit_distance.py
@@ -1,7 +1,7 @@
 import unicodedata
 
 from multimethod import multimethod
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 from rapidfuzz.distance import Levenshtein
 
 from .extracted_text import ExtractedText
diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py
index ebb9631..19ad9c1 100644
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -9,7 +9,7 @@ import attr
 import numpy as np
 from lxml import etree as ET
 from ocrd_utils import getLogger
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 
 
 class Normalization(enum.Enum):
diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py
index 29101cd..6384dfa 100644
--- a/qurator/dinglehopper/ocr_files.py
+++ b/qurator/dinglehopper/ocr_files.py
@@ -4,7 +4,7 @@ from typing import Iterator
 
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 
 from .extracted_text import ExtractedText, normalize_sbb
 
diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py
index ccfc64a..3b9ff5e 100644
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -2,24 +2,24 @@ import unicodedata
 from typing import Tuple, Iterable
 from multimethod import multimethod
 
-import uniseg2.wordbreak
+import uniseg.wordbreak
 
 from rapidfuzz.distance import Levenshtein
 from . import ExtractedText
 
 
-# Did we patch uniseg2.wordbreak.word_break already?
+# Did we patch uniseg.wordbreak.word_break already?
 word_break_patched = False
 
 
 def patch_word_break():
     """
-    Patch uniseg2.wordbreak.word_break to deal with our private use characters.
+    Patch uniseg.wordbreak.word_break to deal with our private use characters.
 
     See also
     https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
     """
-    old_word_break = uniseg2.wordbreak.word_break
+    old_word_break = uniseg.wordbreak.word_break
 
     def new_word_break(c, index=0):
         if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
@@ -27,7 +27,7 @@ def patch_word_break():
         else:
             return old_word_break(c, index)
 
-    uniseg2.wordbreak.word_break = new_word_break
+    uniseg.wordbreak.word_break = new_word_break
     global word_break_patched
     word_break_patched = True
 
@@ -53,8 +53,8 @@ def words(s: str):
         return cat in unwanted_categories or subcat in unwanted_subcategories
 
     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
-    # uniseg2.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
-    for word in uniseg2.wordbreak.words(s):
+    # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
+    for word in uniseg.wordbreak.words(s):
         if all(unwanted(c) for c in word):
             pass
         else:
diff --git a/requirements.txt b/requirements.txt
index 0389f61..11d1dcf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 click
 jinja2
 lxml
-uniseg2
+uniseg
 numpy
 colorama
 MarkupSafe