dinglehopper/qurator/dinglehopper/edit_distance.py

import unicodedata

from multimethod import multimethod
from uniseg2.graphemecluster import grapheme_clusters
from rapidfuzz.distance import Levenshtein

from .extracted_text import ExtractedText


@multimethod
def distance(seq1: list[str], seq2: list[str]):
    """Compute the Levenshtein edit distance between two lists of strings

    The lists are handed to the Levenshtein implementation unchanged: this variant
    applies neither Unicode normalization nor grapheme cluster segmentation itself.
    Use it for input that is already segmented, e.g. lists of grapheme clusters;
    the str variant of this function does that preparation on its own.
    """
    edit_distance = Levenshtein.distance(seq1, seq2)
    return edit_distance


@multimethod
def distance(s1: str, s2: str):
    """Compute the Levenshtein edit distance between two Unicode strings

    Both strings are first normalized to NFC and then split into grapheme clusters.
    The distance is computed on these cluster sequences, so it counts edits of
    grapheme clusters rather than edits of single code points.
    """
    # The generator is consumed left to right, so s1 is fully prepared before s2.
    clusters1, clusters2 = (
        list(grapheme_clusters(unicodedata.normalize("NFC", text)))
        for text in (s1, s2)
    )
    return Levenshtein.distance(clusters1, clusters2)


@multimethod
def distance(s1: ExtractedText, s2: ExtractedText):
    """Compute the Levenshtein edit distance between two ExtractedText objects

    The comparison uses the grapheme_clusters attribute of both objects as is; no
    further normalization or segmentation is done here.
    """
    clusters1 = s1.grapheme_clusters
    clusters2 = s2.grapheme_clusters
    return Levenshtein.distance(clusters1, clusters2)


def editops(word1, word2):
    """
    Return the list of edit operations that transform word1 into word2.

    Both inputs are normalized to NFC and split into grapheme clusters before the
    comparison. The positions in the returned operations therefore index into the
    _grapheme cluster_ sequences, not into the characters of the input strings!
    """

    def _clusters(text):
        # NFC-normalize first so that equivalent spellings segment identically.
        return list(grapheme_clusters(unicodedata.normalize("NFC", text)))

    ops = Levenshtein.editops(_clusters(word1), _clusters(word2))
    return ops.as_list()
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 2019-12-09 12:44:05 +01:00			`import unicodedata`

🎨 dinglehopper: Use multimethod to handle str vs ExtractedText 2020-10-08 12:15:58 +02:00			`from multimethod import multimethod`
replace uniseg with uniseg2 2022-08-29 22:08:25 +02:00			`from uniseg2.graphemecluster import grapheme_clusters`
Revert "Revert "Merge pull request #67 from maxbachmann/rapidfuzz"" This reverts commit 76bd50f1db64d4e93b53740fd5f3bbe4ff328d59. 2022-08-17 11:42:19 +02:00			`from rapidfuzz.distance import Levenshtein`
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 2019-12-09 12:44:05 +01:00
🧹 dinglehopper: Sanitize imports 2020-10-08 13:33:19 +02:00			`from .extracted_text import ExtractedText`
➡️ dinglehopper: Move ExtractedText to its own file 2020-10-08 13:25:20 +02:00
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 2019-12-09 12:44:05 +01:00
move grapheme clusters to ExtractedText 2022-08-29 01:49:04 +02:00			`@multimethod`
			`def distance(seq1: list[str], seq2: list[str]):`
			`"""Compute the Levenshtein edit distance between two Unicode strings`

			`Note that this is different from levenshtein() as this function knows about Unicode`
			`normalization and grapheme clusters. This should be the correct way to compare two`
			`Unicode strings.`
			`"""`
			`return Levenshtein.distance(seq1, seq2)`

apply black 2022-08-29 01:50:19 +02:00
🎨 dinglehopper: Use multimethod to handle str vs ExtractedText 2020-10-08 12:15:58 +02:00			`@multimethod`
			`def distance(s1: str, s2: str):`
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 2019-12-09 12:44:05 +01:00			`"""Compute the Levenshtein edit distance between two Unicode strings`

⚡ dinglehopper: Use rapidfuzz for editops 2021-10-22 15:38:59 +02:00			`Note that this is different from levenshtein() as this function knows about Unicode`
			`normalization and grapheme clusters. This should be the correct way to compare two`
			`Unicode strings.`
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 2019-12-09 12:44:05 +01:00			`"""`
🎨 dinglehopper: Reformat using black 2020-11-10 12:29:55 +01:00			`seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))`
			`seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))`
Revert "Revert "Merge pull request #67 from maxbachmann/rapidfuzz"" This reverts commit 76bd50f1db64d4e93b53740fd5f3bbe4ff328d59. 2022-08-17 11:42:19 +02:00			`return Levenshtein.distance(seq1, seq2)`
🚧 dinglehopper: Support str & ExtractedText in CER and distance functions 2020-10-08 10:47:20 +02:00
🚧 dinglehopper: Guarantee NFC + rename from_text → from_str 2020-10-08 11:25:01 +02:00
🎨 dinglehopper: Use multimethod to handle str vs ExtractedText 2020-10-08 12:15:58 +02:00			`@multimethod`
			`def distance(s1: ExtractedText, s2: ExtractedText):`
move grapheme clusters to ExtractedText 2022-08-29 01:49:04 +02:00			`return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters)`
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 2019-12-09 12:44:05 +01:00

			`def editops(word1, word2):`
📓 dinglehopper: Document editops() 2020-06-12 17:01:28 +02:00			`"""`
			`Return sequence of edit operations transforming one string to another.`

			`Note that this returns indices to the _grapheme clusters_, not characters!`
			`"""`
🎨 dinglehopper: Reformat using black 2020-11-10 12:29:55 +01:00			`word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))`
			`word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))`
🐛 Update editops() and seq_align() due to RapidFuzz API changes 2022-08-17 17:55:44 +02:00			`return Levenshtein.editops(word1, word2).as_list()`