From d2bbc8a6c7d150d5bdd95cc139112eacf963dc78 Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Sun, 11 Sep 2022 02:38:32 +0200 Subject: [PATCH] update rapidfuzz version --- qurator/dinglehopper/align.py | 5 ++--- qurator/dinglehopper/cli.py | 27 +++++---------------------- qurator/dinglehopper/cli_line_dirs.py | 5 +++-- requirements.txt | 2 +- 4 files changed, 11 insertions(+), 28 deletions(-) diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py index cc96891..968d931 100644 --- a/qurator/dinglehopper/align.py +++ b/qurator/dinglehopper/align.py @@ -1,7 +1,6 @@ from .edit_distance import * from rapidfuzz.distance import Levenshtein - def align(t1, t2): """Align text.""" s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1))) @@ -9,11 +8,11 @@ def align(t1, t2): return seq_align(s1, s2) -def seq_align(s1, s2): +def seq_align(s1, s2, score_hint=None): """Align general sequences.""" s1 = list(s1) s2 = list(s2) - ops = Levenshtein.editops(s1, s2) + ops = Levenshtein.editops(s1, s2, score_hint=score_hint) i = 0 j = 0 diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 7b74b78..ef101a4 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -4,6 +4,7 @@ import click from jinja2 import Environment, FileSystemLoader from markupsafe import escape from ocrd_utils import initLogging +from math import ceil from .character_error_rate import character_error_rate_n from .word_error_rate import word_error_rate_n, words_normalized @@ -13,7 +14,7 @@ from .ocr_files import extract from .config import Config -def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): +def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, score_hint=None): gtx = "" ocrx = "" @@ -52,7 +53,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): g_pos = 0 o_pos = 0 - for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)): + for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, score_hint)): css_classes = None gt_id = None ocr_id = None @@ -109,12 +110,12 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): cer, n_characters = character_error_rate_n(gt_text, ocr_text) char_diff_report = gen_diff_report( - gt_text, ocr_text, css_prefix="c", joiner="", none="·" + gt_text, ocr_text, css_prefix="c", joiner="", none="·", score_hint=int(ceil(cer * n_characters)) ) wer, n_words = word_error_rate_n(gt_words, ocr_words) word_diff_report = gen_diff_report( - gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯" + gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯", score_hint=int(ceil(wer * n_words)) ) env = Environment( @@ -175,24 +176,6 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): By default, the text of PAGE files is extracted on 'region' level. You may use "--textequiv-level line" to extract from the level of TextLine tags. """ - import cProfile - import pstats - import io - import atexit - - #print("Profiling...") - #pr = cProfile.Profile() - #pr.enable() - - def exit(): - pr.disable() - print("Profiling completed") - s = io.StringIO() - pstats.Stats(pr, stream=s).sort_stats("cumtime").print_stats() - print(s.getvalue()) - - #atexit.register(exit) - initLogging() Config.progress = progress process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level) diff --git a/qurator/dinglehopper/cli_line_dirs.py b/qurator/dinglehopper/cli_line_dirs.py index 3f8e3fc..06bbe39 100644 --- a/qurator/dinglehopper/cli_line_dirs.py +++ b/qurator/dinglehopper/cli_line_dirs.py @@ -4,6 +4,7 @@ import itertools import click from jinja2 import Environment, FileSystemLoader from ocrd_utils import initLogging +from math import ceil from .character_error_rate import character_error_rate_n from .word_error_rate import word_error_rate_n, words_normalized @@ -74,10 +75,10 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): # Generate diff reports char_diff_report += gen_diff_report( - gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·" + gt_text, ocr_text, css_prefix="l{0}-c".format(k), joiner="", none="·", score_hint=int(ceil(l_cer * l_n_characters)) ) word_diff_report += gen_diff_report( - gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯" + gt_words, ocr_words, css_prefix="l{0}-w".format(k), joiner=" ", none="⋯", score_hint=int(ceil(l_wer * l_n_words)) ) env = Environment( diff --git a/requirements.txt b/requirements.txt index 3c7a257..0389f61 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,5 +9,5 @@ ocrd >= 2.20.1 attrs multimethod == 1.3 # latest version to officially support Python 3.5 tqdm -rapidfuzz >= 2.4.2 +rapidfuzz >= 2.7.0 six # XXX workaround OCR-D/core#730