mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 11:50:00 +02:00
🚧 dinglehopper: Support str & ExtractedText in CER and distance functions
This commit is contained in:
parent
5bee55c896
commit
7843824eaf
2 changed files with 10 additions and 7 deletions
|
@ -6,6 +6,7 @@ from typing import Tuple
|
|||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from qurator.dinglehopper.edit_distance import distance
|
||||
from qurator.dinglehopper.ocr_files import ExtractedText
|
||||
|
||||
|
||||
def character_error_rate_n(reference, compared) -> Tuple[float, int]:
|
||||
|
@ -14,12 +15,13 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
|
|||
|
||||
:return: character error rate and length of the reference
|
||||
"""
|
||||
if isinstance(reference, str):
|
||||
return character_error_rate_n(
|
||||
ExtractedText.from_text(reference),
|
||||
compared)
|
||||
|
||||
d = distance(reference, compared)
|
||||
# XXX
|
||||
from .cli import ExtractedText
|
||||
if isinstance(reference, ExtractedText):
|
||||
reference = reference.text
|
||||
n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
|
||||
n = len(list(grapheme_clusters(reference.text)))
|
||||
|
||||
if d == 0:
|
||||
return 0, n
|
||||
|
|
|
@ -7,6 +7,7 @@ from typing import Sequence, Tuple
|
|||
import numpy as np
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from .ocr_files import ExtractedText
|
||||
|
||||
def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
|
||||
"""Compute the matrix commonly computed to produce the Levenshtein distance.
|
||||
|
@ -75,12 +76,12 @@ def distance(s1, s2):
|
|||
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
|
||||
clusters. This should be the correct way to compare two Unicode strings.
|
||||
"""
|
||||
# XXX
|
||||
from .cli import ExtractedText
|
||||
|
||||
if isinstance(s1, ExtractedText):
|
||||
s1 = s1.text
|
||||
if isinstance(s2, ExtractedText):
|
||||
s2 = s2.text
|
||||
|
||||
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
|
||||
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
|
||||
return levenshtein(s1, s2)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue