use uniseg again

pull/72/head
Max Bachmann 2 years ago committed by GitHub
parent d2bbc8a6c7
commit f48e305347
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -2,7 +2,7 @@ import unicodedata
from typing import Tuple from typing import Tuple
from multimethod import multimethod from multimethod import multimethod
from uniseg2.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
from .edit_distance import distance from .edit_distance import distance
from .extracted_text import ExtractedText from .extracted_text import ExtractedText

@ -1,7 +1,7 @@
import unicodedata import unicodedata
from multimethod import multimethod from multimethod import multimethod
from uniseg2.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
from rapidfuzz.distance import Levenshtein from rapidfuzz.distance import Levenshtein
from .extracted_text import ExtractedText from .extracted_text import ExtractedText

@ -9,7 +9,7 @@ import attr
import numpy as np import numpy as np
from lxml import etree as ET from lxml import etree as ET
from ocrd_utils import getLogger from ocrd_utils import getLogger
from uniseg2.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
class Normalization(enum.Enum): class Normalization(enum.Enum):

@ -4,7 +4,7 @@ from typing import Iterator
from lxml import etree as ET from lxml import etree as ET
from lxml.etree import XMLSyntaxError from lxml.etree import XMLSyntaxError
from uniseg2.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
from .extracted_text import ExtractedText, normalize_sbb from .extracted_text import ExtractedText, normalize_sbb

@ -2,24 +2,24 @@ import unicodedata
from typing import Tuple, Iterable from typing import Tuple, Iterable
from multimethod import multimethod from multimethod import multimethod
import uniseg2.wordbreak import uniseg.wordbreak
from rapidfuzz.distance import Levenshtein from rapidfuzz.distance import Levenshtein
from . import ExtractedText from . import ExtractedText
# Did we patch uniseg2.wordbreak.word_break already? # Did we patch uniseg.wordbreak.word_break already?
word_break_patched = False word_break_patched = False
def patch_word_break(): def patch_word_break():
""" """
Patch uniseg2.wordbreak.word_break to deal with our private use characters. Patch uniseg.wordbreak.word_break to deal with our private use characters.
See also See also
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
""" """
old_word_break = uniseg2.wordbreak.word_break old_word_break = uniseg.wordbreak.word_break
def new_word_break(c, index=0): def new_word_break(c, index=0):
if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area
@ -27,7 +27,7 @@ def patch_word_break():
else: else:
return old_word_break(c, index) return old_word_break(c, index)
uniseg2.wordbreak.word_break = new_word_break uniseg.wordbreak.word_break = new_word_break
global word_break_patched global word_break_patched
word_break_patched = True word_break_patched = True
@ -53,8 +53,8 @@ def words(s: str):
return cat in unwanted_categories or subcat in unwanted_subcategories return cat in unwanted_categories or subcat in unwanted_subcategories
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
# uniseg2.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters." # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
for word in uniseg2.wordbreak.words(s): for word in uniseg.wordbreak.words(s):
if all(unwanted(c) for c in word): if all(unwanted(c) for c in word):
pass pass
else: else:

@ -1,7 +1,7 @@
click click
jinja2 jinja2
lxml lxml
uniseg2 uniseg
numpy numpy
colorama colorama
MarkupSafe MarkupSafe

Loading…
Cancel
Save