🚧 dinglehopper: Support str & ExtractedText in CER and distance functions

pull/38/head
Gerber, Mike 4 years ago
parent 5bee55c896
commit 7843824eaf

@ -6,6 +6,7 @@ from typing import Tuple
from uniseg.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
from qurator.dinglehopper.edit_distance import distance from qurator.dinglehopper.edit_distance import distance
from qurator.dinglehopper.ocr_files import ExtractedText
def character_error_rate_n(reference, compared) -> Tuple[float, int]: def character_error_rate_n(reference, compared) -> Tuple[float, int]:
@ -14,12 +15,13 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
:return: character error rate and length of the reference :return: character error rate and length of the reference
""" """
if isinstance(reference, str):
return character_error_rate_n(
ExtractedText.from_text(reference),
compared)
d = distance(reference, compared) d = distance(reference, compared)
# XXX
n = len(list(grapheme_clusters(reference.text)))
from .cli import ExtractedText
if isinstance(reference, ExtractedText):
reference = reference.text
n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
if d == 0: if d == 0:
return 0, n return 0, n

@ -7,6 +7,7 @@ from typing import Sequence, Tuple
import numpy as np import numpy as np
from uniseg.graphemecluster import grapheme_clusters from uniseg.graphemecluster import grapheme_clusters
from .ocr_files import ExtractedText
def levenshtein_matrix(seq1: Sequence, seq2: Sequence): def levenshtein_matrix(seq1: Sequence, seq2: Sequence):
"""Compute the matrix commonly computed to produce the Levenshtein distance. """Compute the matrix commonly computed to produce the Levenshtein distance.
@ -75,12 +76,12 @@ def distance(s1, s2):
Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
clusters. This should be the correct way to compare two Unicode strings. clusters. This should be the correct way to compare two Unicode strings.
""" """
# XXX
from .cli import ExtractedText
if isinstance(s1, ExtractedText): if isinstance(s1, ExtractedText):
s1 = s1.text s1 = s1.text
if isinstance(s2, ExtractedText): if isinstance(s2, ExtractedText):
s2 = s2.text s2 = s2.text
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
return levenshtein(s1, s2) return levenshtein(s1, s2)

Loading…
Cancel
Save