mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-07 19:05:13 +02:00
🐛 dinglehopper: Work on NFC'ed grapheme clusters when aligning text
This commit is contained in:
parent
534958be1d
commit
8d055e7b6e
3 changed files with 23 additions and 4 deletions
|
@@ -1,6 +1,15 @@
|
||||||
from .edit_distance import *
|
from .edit_distance import *
|
||||||
|
|
||||||
def align(s1, s2):
|
|
||||||
|
def align(t1, t2):
|
||||||
|
"""Align text."""
|
||||||
|
s1 = list(grapheme_clusters(unicodedata.normalize('NFC', t1)))
|
||||||
|
s2 = list(grapheme_clusters(unicodedata.normalize('NFC', t2)))
|
||||||
|
return seq_align(s1, s2)
|
||||||
|
|
||||||
|
|
||||||
|
def seq_align(s1, s2):
|
||||||
|
"""Align general sequences."""
|
||||||
s1 = list(s1)
|
s1 = list(s1)
|
||||||
s2 = list(s2)
|
s2 = list(s2)
|
||||||
ops = seq_editops(s1, s2)
|
ops = seq_editops(s1, s2)
|
||||||
|
|
|
@@ -7,7 +7,7 @@ from jinja2 import Environment, FileSystemLoader
|
||||||
from qurator.dinglehopper import *
|
from qurator.dinglehopper import *
|
||||||
|
|
||||||
|
|
||||||
def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none):
|
def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
|
||||||
gtx = ''
|
gtx = ''
|
||||||
ocrx = ''
|
ocrx = ''
|
||||||
|
|
||||||
|
@@ -58,11 +58,11 @@ def process(gt, ocr, report_prefix):
|
||||||
wer = word_error_rate(gt_text, ocr_text)
|
wer = word_error_rate(gt_text, ocr_text)
|
||||||
uwer = unordered_word_error_rate(gt_text, ocr_text)
|
uwer = unordered_word_error_rate(gt_text, ocr_text)
|
||||||
|
|
||||||
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
|
char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
|
||||||
|
|
||||||
gt_words = words(gt_text)
|
gt_words = words(gt_text)
|
||||||
ocr_words = words(ocr_text)
|
ocr_words = words(ocr_text)
|
||||||
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
|
word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align)
|
||||||
|
|
||||||
env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
|
env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
|
||||||
for report_suffix in ('.html', '.json'):
|
for report_suffix in ('.html', '.json'):
|
||||||
|
|
|
@@ -1,3 +1,5 @@
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
from .. import seq_editops, editops
|
from .. import seq_editops, editops
|
||||||
|
|
||||||
|
|
||||||
|
@@ -36,3 +38,11 @@ def test_editops():
|
||||||
# In these cases, one of the words has a composed form, the other one does not.
|
# In these cases, one of the words has a composed form, the other one does not.
|
||||||
assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
|
assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
|
||||||
assert editops('oͤde', 'öde') == [('replace', 0, 0)]
|
assert editops('oͤde', 'öde') == [('replace', 0, 0)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_editops_canonically_equivalent():
|
||||||
|
left = unicodedata.lookup('LATIN SMALL LETTER N') + unicodedata.lookup('COMBINING TILDE')
|
||||||
|
right = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE')
|
||||||
|
assert left != right
|
||||||
|
assert unicodedata.normalize('NFC', left) == unicodedata.normalize('NFC', right)
|
||||||
|
assert editops(left, right) == []
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue