replace uniseg with uniseg2

pull/72/head
Max Bachmann 2 years ago
parent 22c3817f45
commit a1f0a5e2d3

@ -2,7 +2,7 @@ import unicodedata
from typing import Tuple
from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters
from uniseg2.graphemecluster import grapheme_clusters
from .edit_distance import distance
from .extracted_text import ExtractedText

@ -175,6 +175,24 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
By default, the text of PAGE files is extracted on 'region' level. You may
use "--textequiv-level line" to extract from the level of TextLine tags.
"""
import cProfile
import pstats
import io
import atexit
#print("Profiling...")
#pr = cProfile.Profile()
#pr.enable()
def exit():
pr.disable()
print("Profiling completed")
s = io.StringIO()
pstats.Stats(pr, stream=s).sort_stats("cumtime").print_stats()
print(s.getvalue())
#atexit.register(exit)
initLogging()
Config.progress = progress
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)

@ -1,7 +1,7 @@
import unicodedata
from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters
from uniseg2.graphemecluster import grapheme_clusters
from rapidfuzz.distance import Levenshtein
from .extracted_text import ExtractedText

@ -9,7 +9,7 @@ import attr
import numpy as np
from lxml import etree as ET
from ocrd_utils import getLogger
from uniseg.graphemecluster import grapheme_clusters
from uniseg2.graphemecluster import grapheme_clusters
class Normalization(enum.Enum):

@ -4,7 +4,7 @@ from typing import Iterator
from lxml import etree as ET
from lxml.etree import XMLSyntaxError
from uniseg.graphemecluster import grapheme_clusters
from uniseg2.graphemecluster import grapheme_clusters
from .extracted_text import ExtractedText, normalize_sbb

@ -2,24 +2,24 @@ import unicodedata
from typing import Tuple, Iterable
from multimethod import multimethod
import uniseg.wordbreak
import uniseg2.wordbreak
from rapidfuzz.distance import Levenshtein
from . import ExtractedText
# Did we patch uniseg.wordbreak.word_break already?
# Did we patch uniseg2.wordbreak.word_break already?
word_break_patched = False
def patch_word_break():
"""
Patch uniseg.wordbreak.word_break to deal with our private use characters.
Patch uniseg2.wordbreak.word_break to deal with our private use characters.
See also
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
"""
old_word_break = uniseg.wordbreak.word_break
old_word_break = uniseg2.wordbreak.word_break
def new_word_break(c, index=0):
if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area
@ -27,7 +27,7 @@ def patch_word_break():
else:
return old_word_break(c, index)
uniseg.wordbreak.word_break = new_word_break
uniseg2.wordbreak.word_break = new_word_break
global word_break_patched
word_break_patched = True
@ -53,8 +53,8 @@ def words(s: str):
return cat in unwanted_categories or subcat in unwanted_subcategories
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
# uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
for word in uniseg.wordbreak.words(s):
# uniseg2.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
for word in uniseg2.wordbreak.words(s):
if all(unwanted(c) for c in word):
pass
else:

@ -1,7 +1,7 @@
click
jinja2
lxml
uniseg
uniseg2
numpy
colorama
MarkupSafe

Loading…
Cancel
Save