🐛 dinglehopper: Work on NFC'ed grapheme clusters when aligning text

pull/3/head
Gerber, Mike 5 years ago
parent 534958be1d
commit 8d055e7b6e

@ -1,6 +1,15 @@
from .edit_distance import * from .edit_distance import *
def align(s1, s2):
def align(t1, t2):
    """Align two texts on the level of NFC-normalized grapheme clusters.

    Each text is first normalized to Unicode NFC and then split into a list
    of grapheme clusters; the two resulting lists are passed on to
    seq_align(), whose result is returned unchanged.
    """
    def _nfc_clusters(text):
        # Normalize before clustering, so the clusters are built from the
        # NFC form of the text rather than from its raw code points.
        return list(grapheme_clusters(unicodedata.normalize('NFC', text)))

    left = _nfc_clusters(t1)
    right = _nfc_clusters(t2)
    return seq_align(left, right)
def seq_align(s1, s2):
"""Align general sequences."""
s1 = list(s1) s1 = list(s1)
s2 = list(s2) s2 = list(s2)
ops = seq_editops(s1, s2) ops = seq_editops(s1, s2)

@ -7,7 +7,7 @@ from jinja2 import Environment, FileSystemLoader
from qurator.dinglehopper import * from qurator.dinglehopper import *
def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none): def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
gtx = '' gtx = ''
ocrx = '' ocrx = ''
@ -58,11 +58,11 @@ def process(gt, ocr, report_prefix):
wer = word_error_rate(gt_text, ocr_text) wer = word_error_rate(gt_text, ocr_text)
uwer = unordered_word_error_rate(gt_text, ocr_text) uwer = unordered_word_error_rate(gt_text, ocr_text)
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·') char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
gt_words = words(gt_text) gt_words = words(gt_text)
ocr_words = words(ocr_text) ocr_words = words(ocr_text)
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='') word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='', align=seq_align)
env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates'))) env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
for report_suffix in ('.html', '.json'): for report_suffix in ('.html', '.json'):

@ -1,3 +1,5 @@
import unicodedata
from .. import seq_editops, editops from .. import seq_editops, editops
@ -36,3 +38,11 @@ def test_editops():
# In these cases, one of the words has a composed form, the other one does not. # In these cases, one of the words has a composed form, the other one does not.
assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)] assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
assert editops('oͤde', 'öde') == [('replace', 0, 0)] assert editops('oͤde', 'öde') == [('replace', 0, 0)]
def test_editops_canonically_equivalent():
    """editops() must report no edits for canonically equivalent strings."""
    # "n" followed by a combining tilde (two code points) ...
    decomposed = ''.join(
        unicodedata.lookup(name)
        for name in ('LATIN SMALL LETTER N', 'COMBINING TILDE')
    )
    # ... versus the single precomposed code point.
    precomposed = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE')

    # Guard the test setup: the strings differ as code point sequences,
    # yet are identical once both are normalized to NFC.
    assert decomposed != precomposed
    nfc_decomposed = unicodedata.normalize('NFC', decomposed)
    nfc_precomposed = unicodedata.normalize('NFC', precomposed)
    assert nfc_decomposed == nfc_precomposed

    assert editops(decomposed, precomposed) == []

Loading…
Cancel
Save