🐛 dinglehopper: Patch word_break only once

Previously, we (accidentally) patched uniseg's word_break on every call to words(). Do it only once.
2026-03-17 04:31:58 +01:00 · 2022-01-24 18:44:30 +01:00 · 2022-01-24 18:44:30 +01:00 · 8a3f5e48c2
commit 8a3f5e48c2
parent b6bde2b7ec
1 changed files with 22 additions and 5 deletions
--- a/qurator/dinglehopper/word_error_rate.py
+++ b/qurator/dinglehopper/word_error_rate.py
@@ -10,12 +10,17 @@ from rapidfuzz.string_metric import levenshtein
 from . import ExtractedText
-@multimethod
+# Did we patch uniseg.wordbreak.word_break already?
-def words(s: str):
+word_break_patched = False
    """Extract words from a string"""
-    # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also
+
-    # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
+def patch_word_break():
    """
    Patch uniseg.wordbreak.word_break to deal with our private use characters.
    See also
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
    """
    old_word_break = uniseg.wordbreak.word_break
    def new_word_break(c, index=0):
@@ -25,6 +30,18 @@ def words(s: str):
            return old_word_break(c, index)
    uniseg.wordbreak.word_break = new_word_break
    global word_break_patched
    word_break_patched = True
@multimethod
 def words(s: str):
    """Extract words from a string"""
    global word_break_patched
    if not word_break_patched:
        patch_word_break()
    # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
    def unwanted(c):