🚧 dinglehopper: Guarantee NFC + rename from_text → from_str

pull/38/head
Gerber, Mike 4 years ago
parent 7843824eaf
commit a17ee2afec

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

@ -17,7 +17,7 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
""" """
if isinstance(reference, str): if isinstance(reference, str):
return character_error_rate_n( return character_error_rate_n(
ExtractedText.from_text(reference), ExtractedText.from_str(reference),
compared) compared)
d = distance(reference, compared) d = distance(reference, compared)

@ -77,14 +77,16 @@ def distance(s1, s2):
clusters. This should be the correct way to compare two Unicode strings. clusters. This should be the correct way to compare two Unicode strings.
""" """
if isinstance(s1, ExtractedText): # XXX Implicit normalization
s1 = s1.text if isinstance(s1, str):
if isinstance(s2, ExtractedText): s1 = ExtractedText.from_str(s1)
s2 = s2.text if isinstance(s2, str):
s2 = ExtractedText.from_str(s2)
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) # s1 and s2 are now guaranteed (by ExtractedText) to be in NFC
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
return levenshtein(s1, s2) seq1 = list(grapheme_clusters(s1.text))
seq2 = list(grapheme_clusters(s2.text))
return levenshtein(seq1, seq2)
def seq_editops(seq1, seq2): def seq_editops(seq1, seq2):

@ -23,6 +23,12 @@ class Normalization(enum.Enum):
@attr.s(frozen=True) @attr.s(frozen=True)
class ExtractedText: class ExtractedText:
"""
Extracted text
Objects of this class are guaranteed to hold text that is (a) always normalized
according to the object's own normalization and (b) in NFC.
"""
segment_id = attr.ib(type=Optional[str]) segment_id = attr.ib(type=Optional[str])
@segment_id.validator @segment_id.validator
@ -48,6 +54,8 @@ class ExtractedText:
@_text.validator @_text.validator
def check(self, _, value): def check(self, _, value):
if value is not None and unicodedata.normalize('NFC', value) != value:
raise ValueError('String "{}" is not in NFC.'.format(value))
if value is not None and normalize(value, self.normalization) != value: if value is not None and normalize(value, self.normalization) != value:
raise ValueError('String "{}" is not normalized.'.format(value)) raise ValueError('String "{}" is not normalized.'.format(value))
@ -93,9 +101,9 @@ class ExtractedText:
return cls(segment_id, None, None, segment_text) return cls(segment_id, None, None, segment_text)
@classmethod @classmethod
def from_text(cls, text): def from_str(cls, text, normalization=Normalization.NFC_SBB):
return cls(None, None, None, text) normalized_text = normalize(text, normalization)
return cls(None, None, None, normalized_text, normalization=normalization)
def normalize(text, normalization): def normalize(text, normalization):
@ -138,7 +146,7 @@ def alto_extract(tree):
return ExtractedText( return ExtractedText(
None, None,
(ExtractedText.from_text(normalize_sbb(line_text)) for line_text in lines), (ExtractedText.from_str(normalize_sbb(line_text)) for line_text in lines),
'\n', '\n',
None None
) )

@ -20,7 +20,7 @@ def test_text():
def test_normalization_check(): def test_normalization_check():
with pytest.raises(ValueError, match=r'.*is not normalized.*'): with pytest.raises(ValueError, match=r'.*is not in NFC.*'):
ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ')) ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ')) assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))

Loading…
Cancel
Save