diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py index 3b8c0cc..68accae 100644 --- a/qurator/dinglehopper/character_error_rate.py +++ b/qurator/dinglehopper/character_error_rate.py @@ -2,7 +2,7 @@ import unicodedata from typing import Tuple from multimethod import multimethod -from uniseg.graphemecluster import grapheme_clusters +from uniseg2.graphemecluster import grapheme_clusters from .edit_distance import distance from .extracted_text import ExtractedText diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 3c52c5d..7b74b78 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -175,6 +175,24 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): By default, the text of PAGE files is extracted on 'region' level. You may use "--textequiv-level line" to extract from the level of TextLine tags. """ + import cProfile + import pstats + import io + import atexit + + #print("Profiling...") + #pr = cProfile.Profile() + #pr.enable() + + def exit(): + pr.disable() + print("Profiling completed") + s = io.StringIO() + pstats.Stats(pr, stream=s).sort_stats("cumtime").print_stats() + print(s.getvalue()) + + #atexit.register(exit) + initLogging() Config.progress = progress process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level) diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index 2120b80..d89eb8c 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -1,7 +1,7 @@ import unicodedata from multimethod import multimethod -from uniseg.graphemecluster import grapheme_clusters +from uniseg2.graphemecluster import grapheme_clusters from rapidfuzz.distance import Levenshtein from .extracted_text import ExtractedText diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index 19ad9c1..ebb9631 100644 --- 
a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -9,7 +9,7 @@ import attr import numpy as np from lxml import etree as ET from ocrd_utils import getLogger -from uniseg.graphemecluster import grapheme_clusters +from uniseg2.graphemecluster import grapheme_clusters class Normalization(enum.Enum): diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 6384dfa..29101cd 100644 --- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -4,7 +4,7 @@ from typing import Iterator from lxml import etree as ET from lxml.etree import XMLSyntaxError -from uniseg.graphemecluster import grapheme_clusters +from uniseg2.graphemecluster import grapheme_clusters from .extracted_text import ExtractedText, normalize_sbb diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py index 3b9ff5e..ccfc64a 100644 --- a/qurator/dinglehopper/word_error_rate.py +++ b/qurator/dinglehopper/word_error_rate.py @@ -2,24 +2,24 @@ import unicodedata from typing import Tuple, Iterable from multimethod import multimethod -import uniseg.wordbreak +import uniseg2.wordbreak from rapidfuzz.distance import Levenshtein from . import ExtractedText -# Did we patch uniseg.wordbreak.word_break already? +# Did we patch uniseg2.wordbreak.word_break already? word_break_patched = False def patch_word_break(): """ - Patch uniseg.wordbreak.word_break to deal with our private use characters. + Patch uniseg2.wordbreak.word_break to deal with our private use characters. 
See also https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt """ - old_word_break = uniseg.wordbreak.word_break + old_word_break = uniseg2.wordbreak.word_break def new_word_break(c, index=0): if 0xE000 <= ord(c) <= 0xF8FF: # Private Use Area @@ -27,7 +27,7 @@ def patch_word_break(): else: return old_word_break(c, index) - uniseg.wordbreak.word_break = new_word_break + uniseg2.wordbreak.word_break = new_word_break global word_break_patched word_break_patched = True @@ -53,8 +53,8 @@ def words(s: str): return cat in unwanted_categories or subcat in unwanted_subcategories # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using - # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters." - for word in uniseg.wordbreak.words(s): + # uniseg2.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters." + for word in uniseg2.wordbreak.words(s): if all(unwanted(c) for c in word): pass else: diff --git a/requirements.txt b/requirements.txt index daf2b0f..3c7a257 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ click jinja2 lxml -uniseg +uniseg2 numpy colorama MarkupSafe