dinglehopper/qurator/dinglehopper/character_error_rate.py

from __future__ import division

import unicodedata
from typing import Tuple

from uniseg.graphemecluster import grapheme_clusters

from qurator.dinglehopper.edit_distance import distance


def character_error_rate_n(reference, compared) -> Tuple[float, int]:
    """
    Compute the character error rate together with the reference length.

    The rate is the value returned by ``distance(reference, compared)``
    divided by the number of grapheme clusters in the NFC-normalized
    reference text.

    :param reference: the reference text; an ``ExtractedText`` instance is
                      also accepted, in which case its ``.text`` is measured
    :param compared: the text to compare against the reference
    :return: character error rate and length of the reference; the rate is
             0 when the distance is 0, and infinity when the distance is
             non-zero but the reference is empty
    """
    edit_dist = distance(reference, compared)

    # XXX function-local import of ExtractedText (presumably to avoid a
    # circular import with .cli -- TODO confirm)
    from .cli import ExtractedText
    ref_text = reference.text if isinstance(reference, ExtractedText) else reference

    # XXX Should we really count newlines here?
    normalized = unicodedata.normalize('NFC', ref_text)
    ref_length = sum(1 for _ in grapheme_clusters(normalized))

    if edit_dist == 0:
        # Checked first so that two empty texts yield 0 rather than infinity.
        rate = 0
    elif ref_length == 0:
        # Errors against an empty reference: the ratio is undefined.
        rate = float('inf')
    else:
        rate = edit_dist / ref_length
    return rate, ref_length


def character_error_rate(reference, compared) -> float:
    """
    Compute the character error rate only.

    Delegates to character_error_rate_n() and drops the reference length
    from its result.

    :param reference: the reference text
    :param compared: the text to compare against the reference
    :return: character error rate
    """
    return character_error_rate_n(reference, compared)[0]
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 5 years ago			`from __future__ import division`

			`import unicodedata`
✨ dinglehopper: Include number of characters and words in JSON report 5 years ago			`from typing import Tuple`
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 5 years ago
			`from uniseg.graphemecluster import grapheme_clusters`

			`from qurator.dinglehopper.edit_distance import distance`


✨ dinglehopper: Include number of characters and words in JSON report 5 years ago			`def character_error_rate_n(reference, compared) -> Tuple[float, int]:`
			`"""`
			`Compute character error rate.`
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 5 years ago
✨ dinglehopper: Include number of characters and words in JSON report 5 years ago			`:return: character error rate and length of the reference`
			`"""`
			`d = distance(reference, compared)`
🚧 dinglehopper: Display segment id when hovering over a character difference 4 years ago			`# XXX`
			`from .cli import ExtractedText`
			`if isinstance(reference, ExtractedText):`
			`reference = reference.text`
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 5 years ago			`n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))`

✨ dinglehopper: Include number of characters and words in JSON report 5 years ago			`if d == 0:`
			`return 0, n`
			`if n == 0:`
			`return float('inf'), n`
			`return d/n, n`
Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector" This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340. 5 years ago
			`# XXX Should we really count newlines here?`
✨ dinglehopper: Include number of characters and words in JSON report 5 years ago

			`def character_error_rate(reference, compared) -> float:`
			`"""`
			`Compute character error rate.`

			`:return: character error rate`
			`"""`
			`cer, _ = character_error_rate_n(reference, compared)`
			`return cer`