mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 11:50:00 +02:00
✨ dinglehopper: Include number of characters and words in JSON report
This commit is contained in:
parent
6987a8e1e2
commit
745095e52c
4 changed files with 41 additions and 18 deletions
|
@ -1,21 +1,36 @@
|
|||
from __future__ import division
|
||||
|
||||
import unicodedata
|
||||
from typing import Tuple
|
||||
|
||||
from uniseg.graphemecluster import grapheme_clusters
|
||||
|
||||
from qurator.dinglehopper.edit_distance import distance
|
||||
|
||||
|
||||
def character_error_rate(reference, compared):
|
||||
def character_error_rate_n(reference, compared) -> Tuple[float, int]:
|
||||
"""
|
||||
Compute character error rate.
|
||||
|
||||
:return: character error rate and length of the reference
|
||||
"""
|
||||
d = distance(reference, compared)
|
||||
if d == 0:
|
||||
return 0
|
||||
|
||||
n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
|
||||
if n == 0:
|
||||
return float('inf')
|
||||
|
||||
return d/n
|
||||
if d == 0:
|
||||
return 0, n
|
||||
if n == 0:
|
||||
return float('inf'), n
|
||||
return d/n, n
|
||||
|
||||
# XXX Should we really count newlines here?
|
||||
|
||||
|
||||
def character_error_rate(reference, compared) -> float:
    """
    Compute character error rate.

    Thin wrapper around character_error_rate_n() that keeps only the rate
    and drops the length of the reference.

    :return: character error rate
    """
    rate, _reference_length = character_error_rate_n(reference, compared)
    return rate
|
||||
|
|
|
@ -57,8 +57,8 @@ def process(gt, ocr, report_prefix):
|
|||
gt_text = substitute_equivalences(gt_text)
|
||||
ocr_text = substitute_equivalences(ocr_text)
|
||||
|
||||
cer = character_error_rate(gt_text, ocr_text)
|
||||
wer = word_error_rate(gt_text, ocr_text)
|
||||
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
|
||||
wer, n_words = word_error_rate_n(gt_text, ocr_text)
|
||||
|
||||
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
|
||||
|
||||
|
@ -88,7 +88,8 @@ def process(gt, ocr, report_prefix):
|
|||
template = env.get_template(template_fn)
|
||||
template.stream(
|
||||
gt=gt, ocr=ocr,
|
||||
cer=cer, wer=wer,
|
||||
cer=cer, n_characters=n_characters,
|
||||
wer=wer, n_words=n_words,
|
||||
char_diff_report=char_diff_report,
|
||||
word_diff_report=word_diff_report
|
||||
).dump(out_fn)
|
||||
|
|
|
@ -2,5 +2,7 @@
|
|||
"gt": "{{ gt }}",
|
||||
"ocr": "{{ ocr }}",
|
||||
"cer": {{ cer|json_float }},
|
||||
"wer": {{ wer|json_float }}
|
||||
"wer": {{ wer|json_float }},
|
||||
"n_characters": {{ n_characters }},
|
||||
"n_words": {{ n_words }}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from __future__ import division
|
||||
|
||||
import unicodedata
|
||||
from typing import Tuple
|
||||
|
||||
import uniseg.wordbreak
|
||||
|
||||
|
@ -44,7 +45,7 @@ def words_normalized(s):
|
|||
return words(unicodedata.normalize('NFC', s))
|
||||
|
||||
|
||||
def word_error_rate(reference, compared):
|
||||
def word_error_rate_n(reference, compared) -> Tuple[float, int]:
|
||||
if isinstance(reference, str):
|
||||
reference_seq = list(words_normalized(reference))
|
||||
compared_seq = list(words_normalized(compared))
|
||||
|
@ -53,11 +54,15 @@ def word_error_rate(reference, compared):
|
|||
compared_seq = list(compared)
|
||||
|
||||
d = levenshtein(reference_seq, compared_seq)
|
||||
if d == 0:
|
||||
return 0
|
||||
|
||||
n = len(reference_seq)
|
||||
if n == 0:
|
||||
return float('inf')
|
||||
|
||||
return d / n
|
||||
if d == 0:
|
||||
return 0, n
|
||||
if n == 0:
|
||||
return float('inf'), n
|
||||
return d / n, n
|
||||
|
||||
|
||||
def word_error_rate(reference, compared) -> float:
    """
    Compute word error rate.

    Thin wrapper around word_error_rate_n() that keeps only the rate and
    drops the second element of its result.

    :return: word error rate
    """
    rate, _reference_length = word_error_rate_n(reference, compared)
    return rate
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue