1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-07-03 07:29:59 +02:00

🚧 dinglehopper: Support str & ExtractedText in CER and distance functions

This commit is contained in:
Gerber, Mike 2020-10-08 10:47:20 +02:00
parent 5bee55c896
commit 7843824eaf
2 changed files with 10 additions and 7 deletions

View file

@@ -7,6 +7,7 @@ from typing import Sequence, Tuple
import numpy as np
from uniseg.graphemecluster import grapheme_clusters
from .ocr_files import ExtractedText
def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
"""Compute the matrix commonly computed to produce the Levenshtein distance.
@@ -75,12 +76,12 @@ def distance(s1, s2):
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
clusters. This should be the correct way to compare two Unicode strings.
"""
# XXX
from .cli import ExtractedText
if isinstance(s1, ExtractedText):
s1 = s1.text
if isinstance(s2, ExtractedText):
s2 = s2.text
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
return levenshtein(s1, s2)