dinglehopper/qurator/dinglehopper/edit_distance.py

from __future__ import division, print_function

import unicodedata
from functools import partial, lru_cache
from typing import Sequence, Tuple

import numpy as np
from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters
from tqdm import tqdm
from rapidfuzz.string_metric import levenshtein, levenshtein_editops

from .extracted_text import ExtractedText
from .config import Config


@multimethod
def distance(s1: str, s2: str):
    """Return the Levenshtein edit distance between two Unicode strings.

    Both strings are NFC-normalized and split into grapheme clusters first, and
    the resulting cluster lists (not the raw strings) are what gets handed to
    levenshtein(). This is what makes the function different from calling
    levenshtein() directly on the strings, and it should be the correct way to
    compare two Unicode strings.
    """
    # Prepare both inputs identically: normalize to NFC, then split into a
    # list of grapheme clusters.
    clusters1, clusters2 = [
        list(grapheme_clusters(unicodedata.normalize("NFC", text)))
        for text in (s1, s2)
    ]
    return levenshtein(clusters1, clusters2)


@multimethod
def distance(s1: ExtractedText, s2: ExtractedText):
    """Return the edit distance between the texts of two ExtractedText objects.

    Reads the ``text`` attribute of each argument and delegates to ``distance``
    with those two values.
    """
    text1 = s1.text
    text2 = s2.text
    return distance(text1, text2)


def editops(word1, word2):
    """
    Return the sequence of edit operations transforming word1 into word2.

    Both inputs are NFC-normalized and split into grapheme clusters before
    they are passed to levenshtein_editops(), so the indices in the result
    refer to the _grapheme clusters_, not characters!
    """

    def _to_clusters(text):
        # NFC-normalize first, then split into a list of grapheme clusters.
        normalized = unicodedata.normalize("NFC", text)
        return list(grapheme_clusters(normalized))

    source_clusters = _to_clusters(word1)
    target_clusters = _to_clusters(word2)
    return levenshtein_editops(source_clusters, target_clusters)