qurator-spk/dinglehopper (https://github.com/qurator-spk/dinglehopper.git)
use uniseg again
commit f48e305347 (parent d2bbc8a6c7)

6 changed files with 12 additions and 12 deletions
@@ -2,7 +2,7 @@ import unicodedata
 from typing import Tuple
 
 from multimethod import multimethod
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 
 from .edit_distance import distance
 from .extracted_text import ExtractedText
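The remaining hunks make the same substitution: every module that segments text into grapheme clusters now imports grapheme_clusters from uniseg rather than the uniseg2 fork. As a minimal sketch (not code from this repository), this is what uniseg.graphemecluster.grapheme_clusters yields and why dinglehopper segments on grapheme clusters rather than raw code points before computing distances; the sample string is purely illustrative:

# Illustrative only: grapheme_clusters() groups code points into user-perceived
# characters per UAX #29, so a base letter plus a combining mark counts as one unit.
from uniseg.graphemecluster import grapheme_clusters

s = "Schlyn\u0303"  # "Schlyñ" with the tilde written as a combining character
print(len(s))                           # 7 code points
print(len(list(grapheme_clusters(s))))  # 6 grapheme clusters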
@@ -1,7 +1,7 @@
 import unicodedata
 
 from multimethod import multimethod
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 from rapidfuzz.distance import Levenshtein
 
 from .extracted_text import ExtractedText
@@ -9,7 +9,7 @@ import attr
 import numpy as np
 from lxml import etree as ET
 from ocrd_utils import getLogger
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 
 
 class Normalization(enum.Enum):
@@ -4,7 +4,7 @@ from typing import Iterator
 
 from lxml import etree as ET
 from lxml.etree import XMLSyntaxError
-from uniseg2.graphemecluster import grapheme_clusters
+from uniseg.graphemecluster import grapheme_clusters
 
 from .extracted_text import ExtractedText, normalize_sbb
 
@@ -2,24 +2,24 @@ import unicodedata
 from typing import Tuple, Iterable
 from multimethod import multimethod
 
-import uniseg2.wordbreak
+import uniseg.wordbreak
 
 from rapidfuzz.distance import Levenshtein
 from . import ExtractedText
 
 
-# Did we patch uniseg2.wordbreak.word_break already?
+# Did we patch uniseg.wordbreak.word_break already?
 word_break_patched = False
 
 
 def patch_word_break():
     """
-    Patch uniseg2.wordbreak.word_break to deal with our private use characters.
+    Patch uniseg.wordbreak.word_break to deal with our private use characters.
 
     See also
     https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
     """
-    old_word_break = uniseg2.wordbreak.word_break
+    old_word_break = uniseg.wordbreak.word_break
 
     def new_word_break(c, index=0):
         if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
@@ -27,7 +27,7 @@ def patch_word_break():
         else:
             return old_word_break(c, index)
 
-    uniseg2.wordbreak.word_break = new_word_break
+    uniseg.wordbreak.word_break = new_word_break
    global word_break_patched
     word_break_patched = True
 
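The two hunks above re-point the monkey patch at uniseg. Read together, the wrapping technique looks roughly like the sketch below; note that the return value for Private Use Area characters is an assumption, since that line falls outside the hunks shown in this diff:

# Sketch of the wrapping technique from the hunks above; "ALetter" for PUA
# characters is an assumption (that branch is not part of the diff shown).
import uniseg.wordbreak

old_word_break = uniseg.wordbreak.word_break

def new_word_break(c, index=0):
    if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area placeholders
        return "ALetter"  # assumed: let PUA placeholders behave like letters
    return old_word_break(c, index)

uniseg.wordbreak.word_break = new_word_break  # later words() calls see the wrapper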
@@ -53,8 +53,8 @@ def words(s: str):
         return cat in unwanted_categories or subcat in unwanted_subcategories
 
     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
-    # uniseg2.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
-    for word in uniseg2.wordbreak.words(s):
+    # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
+    for word in uniseg.wordbreak.words(s):
         if all(unwanted(c) for c in word):
             pass
         else:
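The last hunk switches the actual segmentation call back to uniseg.wordbreak.words(). That generator yields every segment between UAX #29 word boundaries, including runs of whitespace and punctuation, which is why the loop above filters out "words" whose characters are all unwanted. A small illustration (not from the repository):

# Illustrative only: uniseg.wordbreak.words() also yields whitespace and
# punctuation segments, so callers have to filter them out themselves.
from uniseg.wordbreak import words as uniseg_words

print(list(uniseg_words("Dies ist ein Beispiel.")))
# roughly: ['Dies', ' ', 'ist', ' ', 'ein', ' ', 'Beispiel', '.']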