@@ -1,14 +1,19 @@
 from __future__ import division
 
 import unicodedata
-from typing import Tuple
+from typing import Tuple, Iterable
 
+from multimethod import multimethod
 import uniseg.wordbreak
 
 from .edit_distance import levenshtein
+from .ocr_files import ExtractedText
 
 
-def words(s):
+@multimethod
+def words(s: str):
+    """Extract words from a string"""
+
     # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also
     # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
     old_word_break = uniseg.wordbreak.word_break
@@ -32,11 +37,6 @@ def words(s):
         cat = subcat[0]
         return cat in unwanted_categories or subcat in unwanted_subcategories
 
-    # XXX
-    from .cli import ExtractedText
-    if isinstance(s, ExtractedText):
-        s = s.text
-
     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using
     # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctation "or similar characters."
     for word in uniseg.wordbreak.words(s):
@@ -46,27 +46,37 @@ def words(s):
             yield word
 
 
-def words_normalized(s):
-    # XXX
-    from .cli import ExtractedText
-    if isinstance(s, ExtractedText):
-        s = s.text
+@multimethod
+def words(s: ExtractedText):
+    return words(s.text)
+
+
+@multimethod
+def words_normalized(s: str):
     return words(unicodedata.normalize('NFC', s))
 
 
-def word_error_rate_n(reference, compared) -> Tuple[float, int]:
-    # XXX
-    from .cli import ExtractedText
-    if isinstance(reference, ExtractedText):
-        reference = reference.text
-    if isinstance(compared, ExtractedText):
-        compared = compared.text
-    if isinstance(reference, str):
-        reference_seq = list(words_normalized(reference))
-        compared_seq = list(words_normalized(compared))
-    else:
-        reference_seq = list(reference)
-        compared_seq = list(compared)
+@multimethod
+def words_normalized(s: ExtractedText):
+    return words_normalized(s.text)
+
+
+@multimethod
+def word_error_rate_n(reference: str, compared: str) -> Tuple[float, int]:
+    reference_seq = list(words_normalized(reference))
+    compared_seq = list(words_normalized(compared))
+    return word_error_rate_n(reference_seq, compared_seq)
+
+
+@multimethod
+def word_error_rate_n(reference: ExtractedText, compared: ExtractedText) -> Tuple[float, int]:
+    return word_error_rate_n(reference.text, compared.text)
+
+
+@multimethod
+def word_error_rate_n(reference: Iterable, compared: Iterable) -> Tuple[float, int]:
+    reference_seq = list(reference)
+    compared_seq = list(compared)
 
     d = levenshtein(reference_seq, compared_seq)
     n = len(reference_seq)
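
Note: the hunks above trade the old pattern of a late "from .cli import ExtractedText" plus isinstance() checks for multimethod dispatch on the annotated parameter types. A minimal sketch of that dispatch pattern is shown below; the Doc class is a hypothetical stand-in for ExtractedText, and str.split() stands in for the real uniseg-based tokenization, so the example runs on its own with only the multimethod package installed:

from multimethod import multimethod


class Doc:
    """Hypothetical stand-in for ExtractedText: an object carrying a .text attribute."""

    def __init__(self, text):
        self.text = text


@multimethod
def words(s: str):
    # Base overload: tokenize the plain string (simplified; the real code uses uniseg).
    return s.split()


@multimethod
def words(s: Doc):
    # Wrapper overload: unwrap and re-dispatch to the str overload,
    # as the diff does for ExtractedText.
    return words(s.text)


print(words("ein kleiner Test"))       # ['ein', 'kleiner', 'Test']
print(words(Doc("ein kleiner Test")))  # same result, routed through the Doc overload

Since str is itself iterable, registering both a (str, str) and an (Iterable, Iterable) overload of word_error_rate_n relies on multimethod choosing the most specific match: plain strings get tokenized by the str overload first, and only pre-tokenized word sequences fall through to the generic Iterable overload that computes the Levenshtein distance.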