1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-10-23 22:44:17 +02:00

🐛 dinglehopper: Work on NFC-normalized grapheme clusters when aligning text

This commit is contained in:
Gerber, Mike 2019-09-30 18:17:13 +02:00
parent 534958be1d
commit 8d055e7b6e
3 changed files with 23 additions and 4 deletions

View file

@ -1,6 +1,15 @@
from .edit_distance import *
def align(s1, s2):
def align(t1, t2):
    """Align two texts.

    Both texts are first normalized to Unicode NFC and then split with
    ``grapheme_clusters``; the two resulting lists are handed to
    ``seq_align``, whose result is returned unchanged.

    :param t1: first text
    :param t2: second text
    :return: whatever ``seq_align`` returns for the two cluster lists
    """
    # Normalizing before clustering means the alignment does not depend on
    # whether a character arrived in composed or decomposed form.
    clusters = [
        list(grapheme_clusters(unicodedata.normalize('NFC', text)))
        for text in (t1, t2)
    ]
    return seq_align(*clusters)
def seq_align(s1, s2):
"""Align general sequences."""
s1 = list(s1)
s2 = list(s2)
ops = seq_editops(s1, s2)

View file

@ -7,7 +7,7 @@ from jinja2 import Environment, FileSystemLoader
from qurator.dinglehopper import *
def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none):
def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
gtx = ''
ocrx = ''
@ -58,11 +58,11 @@ def process(gt, ocr, report_prefix):
wer = word_error_rate(gt_text, ocr_text)
uwer = unordered_word_error_rate(gt_text, ocr_text)
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
gt_words = words(gt_text)
ocr_words = words(ocr_text)
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='')
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='', align=seq_align)
env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
for report_suffix in ('.html', '.json'):

View file

@ -1,3 +1,5 @@
import unicodedata
from .. import seq_editops, editops
@ -36,3 +38,11 @@ def test_editops():
# In these cases, one of the words has a composed form, the other one does not.
assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
assert editops('oͤde', 'öde') == [('replace', 0, 0)]
def test_editops_canonically_equivalent():
    """Check that editops() reports no edits for canonically equivalent strings.

    One string is "n" followed by a combining tilde, the other is the single
    precomposed code point; they differ as Python strings but have the same
    NFC form.
    """
    def nfc(text):
        return unicodedata.normalize('NFC', text)

    base_letter = unicodedata.lookup('LATIN SMALL LETTER N')
    tilde = unicodedata.lookup('COMBINING TILDE')
    decomposed = base_letter + tilde
    precomposed = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE')

    # Sanity checks on the fixture: different code point sequences ...
    assert decomposed != precomposed
    # ... that nevertheless normalize to the same string.
    assert nfc(decomposed) == nfc(precomposed)

    # The actual expectation: no edit operations between the two.
    assert editops(decomposed, precomposed) == []