mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-09 11:50:00 +02:00
🐛 dinglehopper: Work on NFC'ed grapheme clusters when aligning text
This commit is contained in:
parent
534958be1d
commit
8d055e7b6e
3 changed files with 23 additions and 4 deletions
|
@ -1,6 +1,15 @@
|
|||
from .edit_distance import *
|
||||
|
||||
def align(s1, s2):
|
||||
|
||||
def align(t1, t2):
    """Align two texts.

    Each text is first normalized to Unicode NFC and then split with
    ``grapheme_clusters``; the two resulting lists are handed to
    ``seq_align``, whose result is returned unchanged.
    """

    def _nfc_clusters(text):
        # Normalize before clustering so that both inputs are split from
        # the same normalization form.
        normalized = unicodedata.normalize('NFC', text)
        return list(grapheme_clusters(normalized))

    return seq_align(_nfc_clusters(t1), _nfc_clusters(t2))
|
||||
|
||||
|
||||
def seq_align(s1, s2):
|
||||
"""Align general sequences."""
|
||||
s1 = list(s1)
|
||||
s2 = list(s2)
|
||||
ops = seq_editops(s1, s2)
|
||||
|
|
|
@ -7,7 +7,7 @@ from jinja2 import Environment, FileSystemLoader
|
|||
from qurator.dinglehopper import *
|
||||
|
||||
|
||||
def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none):
|
||||
def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
|
||||
gtx = ''
|
||||
ocrx = ''
|
||||
|
||||
|
@ -58,11 +58,11 @@ def process(gt, ocr, report_prefix):
|
|||
wer = word_error_rate(gt_text, ocr_text)
|
||||
uwer = unordered_word_error_rate(gt_text, ocr_text)
|
||||
|
||||
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
|
||||
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
|
||||
|
||||
gt_words = words(gt_text)
|
||||
ocr_words = words(ocr_text)
|
||||
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
|
||||
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align)
|
||||
|
||||
env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
|
||||
for report_suffix in ('.html', '.json'):
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
import unicodedata
|
||||
|
||||
from .. import seq_editops, editops
|
||||
|
||||
|
||||
|
@ -36,3 +38,11 @@ def test_editops():
|
|||
# In these cases, one of the words has a composed form, the other one does not.
|
||||
assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
|
||||
assert editops('oͤde', 'öde') == [('replace', 0, 0)]
|
||||
|
||||
|
||||
def test_editops_canonically_equivalent():
    """Check that ``editops`` returns no operations for two strings that
    differ as code point sequences but are equal after NFC normalization.
    """
    base_letter = unicodedata.lookup('LATIN SMALL LETTER N')
    tilde_mark = unicodedata.lookup('COMBINING TILDE')
    decomposed = base_letter + tilde_mark
    precomposed = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE')

    # Preconditions: the raw strings differ, their NFC forms do not.
    assert decomposed != precomposed
    nfc_decomposed = unicodedata.normalize('NFC', decomposed)
    nfc_precomposed = unicodedata.normalize('NFC', precomposed)
    assert nfc_decomposed == nfc_precomposed

    assert editops(decomposed, precomposed) == []
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue