From f48e305347ff9ae9fec3641d8bd4101562dda3a3 Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Wed, 12 Oct 2022 18:52:58 +0200 Subject: [PATCH] use uniseg again --- qurator/dinglehopper/character_error_rate.py | 2 +- qurator/dinglehopper/edit_distance.py | 2 +- qurator/dinglehopper/extracted_text.py | 2 +- qurator/dinglehopper/ocr_files.py | 2 +- qurator/dinglehopper/word_error_rate.py | 14 +++++++------- requirements.txt | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py index 68accae..3b8c0cc 100644 --- a/qurator/dinglehopper/character_error_rate.py +++ b/qurator/dinglehopper/character_error_rate.py @@ -2,7 +2,7 @@ import unicodedata from typing import Tuple from multimethod import multimethod -from uniseg2.graphemecluster import grapheme_clusters +from uniseg.graphemecluster import grapheme_clusters from .edit_distance import distance from .extracted_text import ExtractedText diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index d89eb8c..2120b80 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -1,7 +1,7 @@ import unicodedata from multimethod import multimethod -from uniseg2.graphemecluster import grapheme_clusters +from uniseg.graphemecluster import grapheme_clusters from rapidfuzz.distance import Levenshtein from .extracted_text import ExtractedText diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index ebb9631..19ad9c1 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -9,7 +9,7 @@ import attr import numpy as np from lxml import etree as ET from ocrd_utils import getLogger -from uniseg2.graphemecluster import grapheme_clusters +from uniseg.graphemecluster import grapheme_clusters class Normalization(enum.Enum): diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 29101cd..6384dfa 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -4,7 +4,7 @@ from typing import Iterator from lxml import etree as ET from lxml.etree import XMLSyntaxError -from uniseg2.graphemecluster import grapheme_clusters +from uniseg.graphemecluster import grapheme_clusters from .extracted_text import ExtractedText, normalize_sbb diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py index ccfc64a..3b9ff5e 100644 --- a/qurator/dinglehopper/word_error_rate.py +++ b/qurator/dinglehopper/word_error_rate.py @@ -2,24 +2,24 @@ import unicodedata from typing import Tuple, Iterable from multimethod import multimethod -import uniseg2.wordbreak +import uniseg.wordbreak from rapidfuzz.distance import Levenshtein from . import ExtractedText -# Did we patch uniseg2.wordbreak.word_break already? +# Did we patch uniseg.wordbreak.word_break already? word_break_patched = False def patch_word_break(): """ - Patch uniseg2.wordbreak.word_break to deal with our private use characters. + Patch uniseg.wordbreak.word_break to deal with our private use characters. See also https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt """ - old_word_break = uniseg2.wordbreak.word_break + old_word_break = uniseg.wordbreak.word_break def new_word_break(c, index=0): if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area @@ -27,7 +27,7 @@ def patch_word_break(): else: return old_word_break(c, index) - uniseg2.wordbreak.word_break = new_word_break + uniseg.wordbreak.word_break = new_word_break global word_break_patched word_break_patched = True @@ -53,8 +53,8 @@ def words(s: str): return cat in unwanted_categories or subcat in unwanted_subcategories # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using - # uniseg2.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters." - for word in uniseg2.wordbreak.words(s): + # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters." + for word in uniseg.wordbreak.words(s): if all(unwanted(c) for c in word): pass else: diff --git a/requirements.txt b/requirements.txt index 0389f61..11d1dcf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ click jinja2 lxml -uniseg2 +uniseg numpy colorama MarkupSafe