mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-05 16:39:59 +02:00
use uniseg again
This commit is contained in:
parent
d2bbc8a6c7
commit
f48e305347
6 changed files with 12 additions and 12 deletions
|
@ -2,7 +2,7 @@ import unicodedata
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
from multimethod import multimethod
|
from multimethod import multimethod
|
||||||
from uniseg2.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
from .edit_distance import distance
|
from .edit_distance import distance
|
||||||
from .extracted_text import ExtractedText
|
from .extracted_text import ExtractedText
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
from multimethod import multimethod
|
from multimethod import multimethod
|
||||||
from uniseg2.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
from rapidfuzz.distance import Levenshtein
|
from rapidfuzz.distance import Levenshtein
|
||||||
|
|
||||||
from .extracted_text import ExtractedText
|
from .extracted_text import ExtractedText
|
||||||
|
|
|
@ -9,7 +9,7 @@ import attr
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
from ocrd_utils import getLogger
|
from ocrd_utils import getLogger
|
||||||
from uniseg2.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
|
|
||||||
class Normalization(enum.Enum):
|
class Normalization(enum.Enum):
|
||||||
|
|
|
@ -4,7 +4,7 @@ from typing import Iterator
|
||||||
|
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
from lxml.etree import XMLSyntaxError
|
from lxml.etree import XMLSyntaxError
|
||||||
from uniseg2.graphemecluster import grapheme_clusters
|
from uniseg.graphemecluster import grapheme_clusters
|
||||||
|
|
||||||
from .extracted_text import ExtractedText, normalize_sbb
|
from .extracted_text import ExtractedText, normalize_sbb
|
||||||
|
|
||||||
|
|
|
@ -2,24 +2,24 @@ import unicodedata
|
||||||
from typing import Tuple, Iterable
|
from typing import Tuple, Iterable
|
||||||
from multimethod import multimethod
|
from multimethod import multimethod
|
||||||
|
|
||||||
import uniseg2.wordbreak
|
import uniseg.wordbreak
|
||||||
|
|
||||||
from rapidfuzz.distance import Levenshtein
|
from rapidfuzz.distance import Levenshtein
|
||||||
from . import ExtractedText
|
from . import ExtractedText
|
||||||
|
|
||||||
|
|
||||||
# Did we patch uniseg2.wordbreak.word_break already?
|
# Did we patch uniseg.wordbreak.word_break already?
|
||||||
word_break_patched = False
|
word_break_patched = False
|
||||||
|
|
||||||
|
|
||||||
def patch_word_break():
|
def patch_word_break():
|
||||||
"""
|
"""
|
||||||
Patch uniseg2.wordbreak.word_break to deal with our private use characters.
|
Patch uniseg.wordbreak.word_break to deal with our private use characters.
|
||||||
|
|
||||||
See also
|
See also
|
||||||
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
|
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
|
||||||
"""
|
"""
|
||||||
old_word_break = uniseg2.wordbreak.word_break
|
old_word_break = uniseg.wordbreak.word_break
|
||||||
|
|
||||||
def new_word_break(c, index=0):
|
def new_word_break(c, index=0):
|
||||||
if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area
|
if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area
|
||||||
|
@ -27,7 +27,7 @@ def patch_word_break():
|
||||||
else:
|
else:
|
||||||
return old_word_break(c, index)
|
return old_word_break(c, index)
|
||||||
|
|
||||||
uniseg2.wordbreak.word_break = new_word_break
|
uniseg.wordbreak.word_break = new_word_break
|
||||||
global word_break_patched
|
global word_break_patched
|
||||||
word_break_patched = True
|
word_break_patched = True
|
||||||
|
|
||||||
|
@ -53,8 +53,8 @@ def words(s: str):
|
||||||
return cat in unwanted_categories or subcat in unwanted_subcategories
|
return cat in unwanted_categories or subcat in unwanted_subcategories
|
||||||
|
|
||||||
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
|
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
|
||||||
# uniseg2.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
|
# uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
|
||||||
for word in uniseg2.wordbreak.words(s):
|
for word in uniseg.wordbreak.words(s):
|
||||||
if all(unwanted(c) for c in word):
|
if all(unwanted(c) for c in word):
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
click
|
click
|
||||||
jinja2
|
jinja2
|
||||||
lxml
|
lxml
|
||||||
uniseg2
|
uniseg
|
||||||
numpy
|
numpy
|
||||||
colorama
|
colorama
|
||||||
MarkupSafe
|
MarkupSafe
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue