🐛 dinglehopper: Patch word_break only once
continuous-integration/drone/push Build encountered an error Details

Previously, we (accidentally) patched uniseg's word_break on every call
to words(). Now the patch is applied only once.
pull/66/head
Gerber, Mike 3 years ago
parent b6bde2b7ec
commit 8a3f5e48c2

@ -10,12 +10,17 @@ from rapidfuzz.string_metric import levenshtein
from . import ExtractedText from . import ExtractedText
@multimethod # Did we patch uniseg.wordbreak.word_break already?
def words(s: str): word_break_patched = False
"""Extract words from a string"""
# Patch uniseg.wordbreak.word_break to deal with our private use characters. See also
# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt def patch_word_break():
"""
Patch uniseg.wordbreak.word_break to deal with our private use characters.
See also
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
"""
old_word_break = uniseg.wordbreak.word_break old_word_break = uniseg.wordbreak.word_break
def new_word_break(c, index=0): def new_word_break(c, index=0):
@ -25,6 +30,18 @@ def words(s: str):
return old_word_break(c, index) return old_word_break(c, index)
uniseg.wordbreak.word_break = new_word_break uniseg.wordbreak.word_break = new_word_break
global word_break_patched
word_break_patched = True
@multimethod
def words(s: str):
"""Extract words from a string"""
global word_break_patched
if not word_break_patched:
patch_word_break()
# Check if c is an unwanted character, i.e. whitespace, punctuation, or similar # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
def unwanted(c): def unwanted(c):

Loading…
Cancel
Save