mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-07 19:05:13 +02:00
🚧 dinglehopper: Guarantee NFC + rename from_text → from_str
This commit is contained in:
parent
7843824eaf
commit
a17ee2afec
5 changed files with 29 additions and 13 deletions
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
|
@ -17,7 +17,7 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]:
|
||||||
"""
|
"""
|
||||||
if isinstance(reference, str):
|
if isinstance(reference, str):
|
||||||
return character_error_rate_n(
|
return character_error_rate_n(
|
||||||
ExtractedText.from_text(reference),
|
ExtractedText.from_str(reference),
|
||||||
compared)
|
compared)
|
||||||
|
|
||||||
d = distance(reference, compared)
|
d = distance(reference, compared)
|
||||||
|
|
|
@ -77,14 +77,16 @@ def distance(s1, s2):
|
||||||
clusters. This should be the correct way to compare two Unicode strings.
|
clusters. This should be the correct way to compare two Unicode strings.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if isinstance(s1, ExtractedText):
|
# XXX Implicit normalization
|
||||||
s1 = s1.text
|
if isinstance(s1, str):
|
||||||
if isinstance(s2, ExtractedText):
|
s1 = ExtractedText.from_str(s1)
|
||||||
s2 = s2.text
|
if isinstance(s2, str):
|
||||||
|
s2 = ExtractedText.from_str(s2)
|
||||||
|
# s1 and s2 are now guaranteed (by ExtractedText) to be in NFC
|
||||||
|
|
||||||
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
|
seq1 = list(grapheme_clusters(s1.text))
|
||||||
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
|
seq2 = list(grapheme_clusters(s2.text))
|
||||||
return levenshtein(s1, s2)
|
return levenshtein(seq1, seq2)
|
||||||
|
|
||||||
|
|
||||||
def seq_editops(seq1, seq2):
|
def seq_editops(seq1, seq2):
|
||||||
|
|
|
@ -23,6 +23,12 @@ class Normalization(enum.Enum):
|
||||||
|
|
||||||
@attr.s(frozen=True)
|
@attr.s(frozen=True)
|
||||||
class ExtractedText:
|
class ExtractedText:
|
||||||
|
"""
|
||||||
|
Extracted text
|
||||||
|
|
||||||
|
Objects of this class are guaranteed to be a. always in their normalization and
|
||||||
|
b. in NFC.
|
||||||
|
"""
|
||||||
segment_id = attr.ib(type=Optional[str])
|
segment_id = attr.ib(type=Optional[str])
|
||||||
|
|
||||||
@segment_id.validator
|
@segment_id.validator
|
||||||
|
@ -48,6 +54,8 @@ class ExtractedText:
|
||||||
|
|
||||||
@_text.validator
|
@_text.validator
|
||||||
def check(self, _, value):
|
def check(self, _, value):
|
||||||
|
if value is not None and unicodedata.normalize('NFC', value) != value:
|
||||||
|
raise ValueError('String "{}" is not in NFC.'.format(value))
|
||||||
if value is not None and normalize(value, self.normalization) != value:
|
if value is not None and normalize(value, self.normalization) != value:
|
||||||
raise ValueError('String "{}" is not normalized.'.format(value))
|
raise ValueError('String "{}" is not normalized.'.format(value))
|
||||||
|
|
||||||
|
@ -93,9 +101,9 @@ class ExtractedText:
|
||||||
return cls(segment_id, None, None, segment_text)
|
return cls(segment_id, None, None, segment_text)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_text(cls, text):
|
def from_str(cls, text, normalization=Normalization.NFC_SBB):
|
||||||
return cls(None, None, None, text)
|
normalized_text = normalize(text, normalization)
|
||||||
|
return cls(None, None, None, normalized_text, normalization=normalization)
|
||||||
|
|
||||||
|
|
||||||
def normalize(text, normalization):
|
def normalize(text, normalization):
|
||||||
|
@ -138,7 +146,7 @@ def alto_extract(tree):
|
||||||
|
|
||||||
return ExtractedText(
|
return ExtractedText(
|
||||||
None,
|
None,
|
||||||
(ExtractedText.from_text(normalize_sbb(line_text)) for line_text in lines),
|
(ExtractedText.from_str(normalize_sbb(line_text)) for line_text in lines),
|
||||||
'\n',
|
'\n',
|
||||||
None
|
None
|
||||||
)
|
)
|
||||||
|
|
|
@ -20,7 +20,7 @@ def test_text():
|
||||||
|
|
||||||
|
|
||||||
def test_normalization_check():
|
def test_normalization_check():
|
||||||
with pytest.raises(ValueError, match=r'.*is not normalized.*'):
|
with pytest.raises(ValueError, match=r'.*is not in NFC.*'):
|
||||||
ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
|
ExtractedText('foo', None, None, unicodedata.normalize('NFD', 'Schlyñ'))
|
||||||
assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))
|
assert ExtractedText('foo', None, None, unicodedata.normalize('NFC', 'Schlyñ'))
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue