replace uniseg with uniseg2

pull/72/head
Max Bachmann 2 years ago
parent 22c3817f45
commit a1f0a5e2d3

@ -2,7 +2,7 @@ import unicodedata
from typing import Tuple from typing import Tuple
from multimethod import multimethod from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters from uniseg2.graphemecluster import grapheme_clusters
from .edit_distance import distance from .edit_distance import distance
from .extracted_text import ExtractedText from .extracted_text import ExtractedText

@ -175,6 +175,24 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
By default, the text of PAGE files is extracted on 'region' level. You may By default, the text of PAGE files is extracted on 'region' level. You may
use "--textequiv-level line" to extract from the level of TextLine tags. use "--textequiv-level line" to extract from the level of TextLine tags.
""" """
import cProfile
import pstats
import io
import atexit
#print("Profiling...")
#pr = cProfile.Profile()
#pr.enable()
def exit():
pr.disable()
print("Profiling completed")
s = io.StringIO()
pstats.Stats(pr, stream=s).sort_stats("cumtime").print_stats()
print(s.getvalue())
#atexit.register(exit)
initLogging() initLogging()
Config.progress = progress Config.progress = progress
process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level) process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)

@ -1,7 +1,7 @@
import unicodedata import unicodedata
from multimethod import multimethod from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters from uniseg2.graphemecluster import grapheme_clusters
from rapidfuzz.distance import Levenshtein from rapidfuzz.distance import Levenshtein
from .extracted_text import ExtractedText from .extracted_text import ExtractedText

@ -9,7 +9,7 @@ import attr
import numpy as np import numpy as np
from lxml import etree as ET from lxml import etree as ET
from ocrd_utils import getLogger from ocrd_utils import getLogger
from uniseg.graphemecluster import grapheme_clusters from uniseg2.graphemecluster import grapheme_clusters
class Normalization(enum.Enum): class Normalization(enum.Enum):

@ -4,7 +4,7 @@ from typing import Iterator
from lxml import etree as ET from lxml import etree as ET
from lxml.etree import XMLSyntaxError from lxml.etree import XMLSyntaxError
from uniseg.graphemecluster import grapheme_clusters from uniseg2.graphemecluster import grapheme_clusters
from .extracted_text import ExtractedText, normalize_sbb from .extracted_text import ExtractedText, normalize_sbb

@ -2,24 +2,24 @@ import unicodedata
from typing import Tuple, Iterable from typing import Tuple, Iterable
from multimethod import multimethod from multimethod import multimethod
import uniseg.wordbreak import uniseg2.wordbreak
from rapidfuzz.distance import Levenshtein from rapidfuzz.distance import Levenshtein
from . import ExtractedText from . import ExtractedText
# Did we patch uniseg.wordbreak.word_break already? # Did we patch uniseg2.wordbreak.word_break already?
word_break_patched = False word_break_patched = False
def patch_word_break(): def patch_word_break():
""" """
Patch uniseg.wordbreak.word_break to deal with our private use characters. Patch uniseg2.wordbreak.word_break to deal with our private use characters.
See also See also
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
""" """
old_word_break = uniseg.wordbreak.word_break old_word_break = uniseg2.wordbreak.word_break
def new_word_break(c, index=0): def new_word_break(c, index=0):
if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area
@ -27,7 +27,7 @@ def patch_word_break():
else: else:
return old_word_break(c, index) return old_word_break(c, index)
uniseg.wordbreak.word_break = new_word_break uniseg2.wordbreak.word_break = new_word_break
global word_break_patched global word_break_patched
word_break_patched = True word_break_patched = True
@ -53,8 +53,8 @@ def words(s: str):
return cat in unwanted_categories or subcat in unwanted_subcategories return cat in unwanted_categories or subcat in unwanted_subcategories
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
# uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters." # uniseg2.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters."
for word in uniseg.wordbreak.words(s): for word in uniseg2.wordbreak.words(s):
if all(unwanted(c) for c in word): if all(unwanted(c) for c in word):
pass pass
else: else:

@ -1,7 +1,7 @@
click click
jinja2 jinja2
lxml lxml
uniseg uniseg2
numpy numpy
colorama colorama
MarkupSafe MarkupSafe

Loading…
Cancel
Save