From 15dfbac3a73a9f6b6d0a7f7e96b6d12cf5032ff6 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 17 Aug 2022 11:42:19 +0200 Subject: [PATCH] Revert "Revert "Merge pull request #67 from maxbachmann/rapidfuzz"" This reverts commit 76bd50f1db64d4e93b53740fd5f3bbe4ff328d59. --- qurator/dinglehopper/align.py | 4 ++-- qurator/dinglehopper/edit_distance.py | 6 +++--- qurator/dinglehopper/notebooks/Levenshtein.ipynb | 4 ++-- qurator/dinglehopper/word_error_rate.py | 4 ++-- requirements.txt | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py index 45c4835..861b8a6 100644 --- a/qurator/dinglehopper/align.py +++ b/qurator/dinglehopper/align.py @@ -1,5 +1,5 @@ from .edit_distance import * -from rapidfuzz.string_metric import levenshtein_editops +from rapidfuzz.distance import Levenshtein def align(t1, t2): @@ -13,7 +13,7 @@ def seq_align(s1, s2): """Align general sequences.""" s1 = list(s1) s2 = list(s2) - ops = levenshtein_editops(s1, s2) + ops = Levenshtein.editops(s1, s2) i = 0 j = 0 diff --git a/qurator/dinglehopper/edit_distance.py b/qurator/dinglehopper/edit_distance.py index 7fa4ae1..b50f006 100644 --- a/qurator/dinglehopper/edit_distance.py +++ b/qurator/dinglehopper/edit_distance.py @@ -8,7 +8,7 @@ import numpy as np from multimethod import multimethod from uniseg.graphemecluster import grapheme_clusters from tqdm import tqdm -from rapidfuzz.string_metric import levenshtein, levenshtein_editops +from rapidfuzz.distance import Levenshtein from .extracted_text import ExtractedText from .config import Config @@ -24,7 +24,7 @@ def distance(s1: str, s2: str): """ seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1))) seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2))) - return levenshtein(seq1, seq2) + return Levenshtein.distance(seq1, seq2) @multimethod @@ -40,4 +40,4 @@ def editops(word1, word2): """ word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1))) word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2))) - return levenshtein_editops(word1, word2) + return Levenshtein.editops(word1, word2) diff --git a/qurator/dinglehopper/notebooks/Levenshtein.ipynb b/qurator/dinglehopper/notebooks/Levenshtein.ipynb index 8761994..a27dca4 100644 --- a/qurator/dinglehopper/notebooks/Levenshtein.ipynb +++ b/qurator/dinglehopper/notebooks/Levenshtein.ipynb @@ -31,7 +31,7 @@ "metadata": {}, "outputs": [], "source": [ - "from rapidfuzz.string_metric import levenshtein" + "from rapidfuzz.distance.Levenshtein import distance as levenshtein" ] }, { @@ -227,7 +227,7 @@ } ], "source": [ - "from rapidfuzz.string_metric import levenshtein_editops as editops\n", + "from rapidfuzz.distance.Levenshtein import editops\n", "\n", "editops('Foo', 'Fon')" ] diff --git a/qurator/dinglehopper/word_error_rate.py b/qurator/dinglehopper/word_error_rate.py index 0eb94a7..8f0cc96 100644 --- a/qurator/dinglehopper/word_error_rate.py +++ b/qurator/dinglehopper/word_error_rate.py @@ -6,7 +6,7 @@ from multimethod import multimethod import uniseg.wordbreak -from rapidfuzz.string_metric import levenshtein +from rapidfuzz.distance import Levenshtein from . import ExtractedText @@ -98,7 +98,7 @@ def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, i reference_seq = list(reference) compared_seq = list(compared) - d = levenshtein(reference_seq, compared_seq) + d = Levenshtein.distance(reference_seq, compared_seq) n = len(reference_seq) if d == 0: diff --git a/requirements.txt b/requirements.txt index 25e8676..daf2b0f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,5 +9,5 @@ ocrd >= 2.20.1 attrs multimethod == 1.3 # latest version to officially support Python 3.5 tqdm -rapidfuzz >= 2.0.5 +rapidfuzz >= 2.4.2 six # XXX workaround OCR-D/core#730