🚧 dinglehopper: Guarantee NFC + rename from_text → from_str

pull/38/head
Gerber, Mike 4 years ago
parent 7843824eaf
commit a17ee2afec

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

@ -17,7 +17,7 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
"""
if isinstance(reference, str):
return character_error_rate_n(
ExtractedText.from_text(reference),
ExtractedText.from_str(reference),
compared)
d = distance(reference, compared)

@ -77,14 +77,16 @@ def distance(s1, s2):
clusters. This should be the correct way to compare two Unicode strings.
"""
if isinstance(s1, ExtractedText):
s1 = s1.text
if isinstance(s2, ExtractedText):
s2 = s2.text
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
return levenshtein(s1, s2)
# XXX Implicit normalization
if isinstance(s1, str):
s1 = ExtractedText.from_str(s1)
if isinstance(s2, str):
s2 = ExtractedText.from_str(s2)
# s1 and s2 are now guaranteed (by ExtractedText) to be in NFC
seq1 = list(grapheme_clusters(s1.text))
seq2 = list(grapheme_clusters(s2.text))
return levenshtein(seq1, seq2)
def seq_editops(seq1, seq2):

@ -23,6 +23,12 @@ class Normalization(enum.Enum):
@attr.s(frozen=True)
class ExtractedText:
"""
Extracted text
Objects of this class are guaranteed to be (a) always normalized according to
their own normalization setting and (b) in Unicode NFC.
"""
segment_id = attr.ib(type=Optional[str])
@segment_id.validator
@ -48,6 +54,8 @@ class ExtractedText:
@_text.validator
def check(self, _, value):
if value is not None and unicodedata.normalize('NFC', value) != value:
raise ValueError('String "{}" is not in NFC.'.format(value))
if value is not None and normalize(value, self.normalization) != value:
raise ValueError('String "{}" is not normalized.'.format(value))
@ -93,9 +101,9 @@ class ExtractedText:
return cls(segment_id, None, None, segment_text)
@classmethod
def from_text(cls, text):
return cls(None, None, None, text)
def from_str(cls, text, normalization=Normalization.NFC_SBB):
normalized_text = normalize(text, normalization)
return cls(None, None, None, normalized_text, normalization=normalization)
def normalize(text, normalization):
@ -138,7 +146,7 @@ def alto_extract(tree):
return ExtractedText(
None,
(ExtractedText.from_text(normalize_sbb(line_text)) for line_text in lines),
(ExtractedText.from_str(normalize_sbb(line_text)) for line_text in lines),
'\n',
None
)

@ -20,7 +20,7 @@ def test_text():
def test_normalization_check():
with pytest.raises(ValueError, match=r'.*is not normalized.*'):
with pytest.raises(ValueError, match=r'.*is not in NFC.*'):
ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))

Loading…
Cancel
Save