# dinglehopper/dinglehopper/edit_distance.py

from __future__ import division, print_function

import unicodedata
from functools import partial, lru_cache
from typing import Sequence, Tuple

import numpy as np
from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters
from tqdm import tqdm
from rapidfuzz.distance import Levenshtein

from .extracted_text import ExtractedText
from .config import Config


@multimethod
def distance(s1: str, s2: str):
    """Return the Levenshtein edit distance between two Unicode strings.

    Each input is normalized to NFC and then segmented into grapheme
    clusters; the distance is computed over those cluster sequences, not over
    the raw characters of the strings. This Unicode-aware comparison is what
    distinguishes this function from a plain character-level Levenshtein
    distance.
    """
    # The generator is consumed left to right, so s1 is fully processed
    # before s2 is touched.
    clusters1, clusters2 = (
        list(grapheme_clusters(unicodedata.normalize("NFC", text)))
        for text in (s1, s2)
    )
    return Levenshtein.distance(clusters1, clusters2)


@multimethod
def distance(s1: ExtractedText, s2: ExtractedText):
    """Return the edit distance between the texts of two ExtractedText objects.

    Reads the ``text`` attribute of each argument and dispatches to
    ``distance`` again with those two values.
    """
    text1, text2 = s1.text, s2.text
    return distance(text1, text2)


def editops(word1, word2):
    """
    Return the list of edit operations that transform word1 into word2.

    Both inputs are NFC-normalized and split into grapheme clusters before
    the operations are computed, so the indices in the result refer to
    _grapheme clusters_, not to characters of the original strings!
    """
    # Both generators are lazy: word1 is normalized and segmented completely
    # before word2 is looked at.
    normalized = (unicodedata.normalize("NFC", word) for word in (word1, word2))
    source, target = (list(grapheme_clusters(text)) for text in normalized)
    operations = Levenshtein.editops(source, target)
    return operations.as_list()
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 5 years ago			`from __future__ import division, print_function`

			`import unicodedata`
			`from functools import partial, lru_cache`
			`from typing import Sequence, Tuple`

			`import numpy as np`
🎨 dinglehopper: Use multimethod to handle str vs ExtractedText 4 years ago			`from multimethod import multimethod`
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 5 years ago			`from uniseg.graphemecluster import grapheme_clusters`
✨ dinglehopper: Show a progressbar on --progress 4 years ago			`from tqdm import tqdm`
Revert "Revert "Merge pull request #67 from maxbachmann/rapidfuzz"" This reverts commit 76bd50f1db64d4e93b53740fd5f3bbe4ff328d59. 2 years ago			`from rapidfuzz.distance import Levenshtein`
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 5 years ago
🧹 dinglehopper: Sanitize imports 4 years ago			`from .extracted_text import ExtractedText`
✨ dinglehopper: Show a progressbar on --progress 4 years ago			`from .config import Config`
➡️ dinglehopper: Move ExtractedText to its own file 4 years ago
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 5 years ago
🎨 dinglehopper: Use multimethod to handle str vs ExtractedText 4 years ago			`@multimethod`
			`def distance(s1: str, s2: str):`
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 5 years ago			`"""Compute the Levenshtein edit distance between two Unicode strings`

⚡ dinglehopper: Use rapidfuzz for editops 3 years ago			`Note that this is different from levenshtein() as this function knows about Unicode`
			`normalization and grapheme clusters. This should be the correct way to compare two`
			`Unicode strings.`
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 5 years ago			`"""`
🎨 dinglehopper: Reformat using black 4 years ago			`seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))`
			`seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))`
Revert "Revert "Merge pull request #67 from maxbachmann/rapidfuzz"" This reverts commit 76bd50f1db64d4e93b53740fd5f3bbe4ff328d59. 2 years ago			`return Levenshtein.distance(seq1, seq2)`
🚧 dinglehopper: Support str & ExtractedText in CER and distance functions 4 years ago
🚧 dinglehopper: Guarantee NFC + rename from_text → from_str 4 years ago
🎨 dinglehopper: Use multimethod to handle str vs ExtractedText 4 years ago			`@multimethod`
			`def distance(s1: ExtractedText, s2: ExtractedText):`
			`return distance(s1.text, s2.text)`
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 5 years ago

			`def editops(word1, word2):`
📓 dinglehopper: Document editops() 4 years ago			`"""`
			`Return sequence of edit operations transforming one string to another.`

			`Note that this returns indices to the _grapheme clusters_, not characters!`
			`"""`
🎨 dinglehopper: Reformat using black 4 years ago			`word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))`
			`word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))`
🐛 Update editops() and seq_align() due to RapidFuzz API changes 2 years ago			`return Levenshtein.editops(word1, word2).as_list()`