mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-08 03:10:30 +02:00
Added comments
This commit is contained in:
parent
cee7b6891b
commit
40f23b8482
5 changed files with 56 additions and 8 deletions
|
@ -9,6 +9,16 @@ from .utils import bag_accuracy, MetricResult, Weights
|
|||
def bag_of_chars_accuracy(
|
||||
reference: str, compared: str, weights: Weights = Weights(1, 0, 1)
|
||||
) -> MetricResult:
|
||||
"""Compute Bag of Chars accuracy and error rate.
|
||||
|
||||
We are using sets to calculate the errors.
|
||||
See :func:`bag_accuracy` for details.
|
||||
|
||||
:param reference: String used as reference (e.g. ground truth).
|
||||
:param compared: String that gets evaluated (e.g. ocr result).
|
||||
:param weights: Weights/costs for editing operations.
|
||||
:return: Class representing the results of this metric.
|
||||
"""
|
||||
reference_chars: Counter = Counter(grapheme_clusters(normalize("NFC", reference)))
|
||||
compared_chars: Counter = Counter(grapheme_clusters(normalize("NFC", compared)))
|
||||
return bag_accuracy(
|
||||
|
|
|
@ -7,6 +7,16 @@ from ..normalize import words_normalized
|
|||
def bag_of_words_accuracy(
|
||||
reference: str, compared: str, weights: Weights = Weights(1, 0, 1)
|
||||
) -> MetricResult:
|
||||
"""Compute Bag of Words accuracy and error rate.
|
||||
|
||||
We are using sets to calculate the errors.
|
||||
See :func:`bag_accuracy` for details.
|
||||
|
||||
:param reference: String used as reference (e.g. ground truth).
|
||||
:param compared: String that gets evaluated (e.g. ocr result).
|
||||
:param weights: Weights/costs for editing operations.
|
||||
:return: Class representing the results of this metric.
|
||||
"""
|
||||
reference_words: Counter = Counter(words_normalized(reference))
|
||||
compared_words: Counter = Counter(words_normalized(compared))
|
||||
return bag_accuracy(
|
||||
|
|
|
@ -8,9 +8,15 @@ def character_accuracy(
|
|||
) -> MetricResult:
|
||||
"""Compute character accuracy and error rate.
|
||||
|
||||
:return: NamedTuple representing the results of this metric.
|
||||
"""
|
||||
We are using the Levenshtein distance between reference and compared.
|
||||
|
||||
:param reference: String used as reference (e.g. ground truth).
|
||||
:param compared: String that gets evaluated (e.g. ocr result).
|
||||
:param weights: Weights/costs for editing operations (not supported yet).
|
||||
:return: Class representing the results of this metric.
|
||||
"""
|
||||
if weights != Weights(1, 1, 1):
|
||||
raise NotImplementedError("Setting weights is not supported yet.")
|
||||
weighted_errors = distance(reference, compared)
|
||||
n_ref = len(chars_normalized(reference))
|
||||
n_cmp = len(chars_normalized(compared))
|
||||
|
|
|
@ -6,25 +6,35 @@ class Weights(NamedTuple):
|
|||
"""Represent weights/costs for editing operations."""
|
||||
|
||||
deletes: int = 1
|
||||
"""Cost for a delete ad -> a."""
|
||||
inserts: int = 1
|
||||
"""Cost for an insert a -> ai."""
|
||||
replacements: int = 1
|
||||
"""Cost for a replacement ab -> ar."""
|
||||
|
||||
|
||||
class MetricResult(NamedTuple):
|
||||
"""Represent a result from a metric calculation."""
|
||||
|
||||
metric: str
|
||||
"""Name of the metric that calculated this results."""
|
||||
weights: Weights
|
||||
"""The `Weights`/costs used to calculate this result."""
|
||||
weighted_errors: int
|
||||
"""The weighted errors calculated by the metric."""
|
||||
reference_elements: int
|
||||
"""Number of elements in the reference string."""
|
||||
compared_elements: int
|
||||
"""Number of elements in the string that is evaluated."""
|
||||
|
||||
@property
|
||||
def accuracy(self) -> float:
|
||||
"""The accuracy calculated as 1 - errors / reference_elements."""
|
||||
return 1 - self.error_rate
|
||||
|
||||
@property
|
||||
def error_rate(self) -> float:
|
||||
"""The error rate calculated by errors / reference_elements."""
|
||||
if self.reference_elements <= 0 and self.compared_elements <= 0:
|
||||
return 0
|
||||
elif self.reference_elements <= 0:
|
||||
|
@ -34,7 +44,7 @@ class MetricResult(NamedTuple):
|
|||
def get_dict(self) -> Dict:
|
||||
"""Combines the properties to a dictionary.
|
||||
|
||||
We deviate from the builtin _asdict() function by including our properties.
|
||||
We deviate from the builtin `_asdict()` function by including our properties.
|
||||
"""
|
||||
return {
|
||||
**{key: value for key, value in self._asdict().items()},
|
||||
|
@ -50,19 +60,19 @@ def bag_accuracy(
|
|||
weights: Weights,
|
||||
metric: str = "bag_accuracy",
|
||||
) -> MetricResult:
|
||||
"""Calculates the the weighted errors for two bags (Counter).
|
||||
"""Calculates the the weighted errors for two bags (Multiset, `Counter`).
|
||||
|
||||
Basic algorithm idea:
|
||||
- All elements in reference not occurring in compared are considered deletes.
|
||||
- All elements in compared not occurring in reference are considered inserts.
|
||||
- All elements in `reference` not occurring in `compared` are considered deletes.
|
||||
- All elements in `compared` not occurring in `reference` are considered inserts.
|
||||
- When the cost for one replacement is lower than that of one insert and one delete
|
||||
we can substitute pairs of deletes and inserts with one replacement.
|
||||
|
||||
:param reference: Bag used as reference (ground truth).
|
||||
:param compared: Bag used to compare (ocr).
|
||||
:param weights: Weights/costs for editing operations.
|
||||
:param weights: `Weights`/costs for editing operations.
|
||||
:param metric: Name of the (original) metric.
|
||||
:return: NamedTuple representing the results of this metric.
|
||||
:return: `NamedTuple` representing the results of this metric.
|
||||
"""
|
||||
n_ref = sum(reference.values())
|
||||
n_cmp = sum(compared.values())
|
||||
|
|
|
@ -6,6 +6,18 @@ from ..normalize import words_normalized
|
|||
def word_accuracy(
|
||||
reference: str, compared: str, weights: Weights = Weights(1, 1, 1)
|
||||
) -> MetricResult:
|
||||
"""Compute word accuracy and error rate.
|
||||
|
||||
We are using the Levenshtein distance between reference.
|
||||
|
||||
:param reference: String used as reference (e.g. ground truth).
|
||||
:param compared: String that gets evaluated (e.g. ocr result).
|
||||
:param weights: Weights/costs for editing operations (not supported yet).
|
||||
:return: Class representing the results of this metric.
|
||||
"""
|
||||
if weights != Weights(1, 1, 1):
|
||||
raise NotImplementedError("Setting weights is not supported yet.")
|
||||
|
||||
reference_seq = list(words_normalized(reference))
|
||||
compared_seq = list(words_normalized(compared))
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue