1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-07-06 17:09:59 +02:00
dinglehopper/src/dinglehopper/character_error_rate.py
Mike Gerber 325e5af5f5 🐛 Move source into src/ to fix install
Installing was broken since moving to pyproject.toml, which we didn't notice because of
leftover files in build/. Fix this by using the convention of having the source files
in src/ and adjusting pyproject.toml accordingly.

Fixes gh-86. 🤞
2023-08-03 17:29:30 +02:00

47 lines
1.1 KiB
Python

from __future__ import division
import unicodedata
from typing import Tuple
from multimethod import multimethod
from uniseg.graphemecluster import grapheme_clusters
from .edit_distance import distance
from .extracted_text import ExtractedText
@multimethod
def character_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
    """
    Compute the character error rate of ``compared`` against ``reference``.

    The rate is ``distance(reference, compared)`` divided by the length of
    the reference, where the length is the number of grapheme clusters in
    the NFC-normalized reference string.

    :param reference: the reference text
    :param compared: the text that is measured against the reference
    :return: character error rate and length of the reference; the rate is
        0 whenever the distance is 0 (even for an empty reference) and
        infinity when the reference is empty but the distance is not 0
    """
    edits = distance(reference, compared)

    # Length is measured in grapheme clusters of the NFC form, not in code
    # points, so combining sequences are counted once.
    normalized_reference = unicodedata.normalize("NFC", reference)
    reference_length = sum(1 for _ in grapheme_clusters(normalized_reference))

    if edits == 0:
        # Checked first so that two empty texts yield 0 rather than infinity.
        rate = 0
    elif reference_length == 0:
        # Non-zero distance against an empty reference: avoid dividing by 0.
        rate = float("inf")
    else:
        rate = edits / reference_length
    return rate, reference_length
# XXX Should we really count newlines here?
@multimethod
def character_error_rate_n(
    reference: ExtractedText, compared: ExtractedText
) -> Tuple[float, int]:
    """
    Compute the character error rate for two ``ExtractedText`` objects.

    Dispatches to ``character_error_rate_n`` again with the ``.text``
    attribute of each argument.

    :param reference: the reference, whose ``.text`` is used
    :param compared: the compared object, whose ``.text`` is used
    :return: character error rate and length of the reference
    """
    reference_text = reference.text
    compared_text = compared.text
    return character_error_rate_n(reference_text, compared_text)
def character_error_rate(reference, compared) -> float:
    """
    Compute the character error rate, without the reference length.

    Convenience wrapper around ``character_error_rate_n`` that drops the
    second element of its result.

    :param reference: the reference, passed on to ``character_error_rate_n``
    :param compared: the compared input, passed on to
        ``character_error_rate_n``
    :return: character error rate
    """
    rate, _reference_length = character_error_rate_n(reference, compared)
    return rate