🐛 dinglehopper: Work on NFC'ed grapheme clusters when aligning text

2025-08-10 10:10:02 +02:00 · 2019-09-30 18:17:13 +02:00 · 2019-09-30 18:17:13 +02:00 · 8d055e7b6e
commit 8d055e7b6e
parent 534958be1d
3 changed files with 23 additions and 4 deletions
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -1,6 +1,15 @@
 from .edit_distance import *
-def align(s1, s2):
+
 def align(t1, t2):
    """Align two texts on NFC-normalized grapheme clusters.

    Each input text is first normalized to Unicode NFC and then split
    with ``grapheme_clusters``. The two resulting lists are passed to
    ``seq_align`` and its result is returned unchanged.
    """
    # Same pipeline for both texts, evaluated for t1 first and then t2.
    clusters1, clusters2 = (
        list(grapheme_clusters(unicodedata.normalize('NFC', text)))
        for text in (t1, t2)
    )
    return seq_align(clusters1, clusters2)
 def seq_align(s1, s2):
    """Align general sequences."""
    s1 = list(s1)
    s2 = list(s2)
    ops = seq_editops(s1, s2)
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -7,7 +7,7 @@ from jinja2 import Environment, FileSystemLoader
 from qurator.dinglehopper import *
-def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none):
+def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
    gtx = ''
    ocrx = ''
@@ -58,11 +58,11 @@ def process(gt, ocr, report_prefix):
    wer = word_error_rate(gt_text, ocr_text)
    uwer = unordered_word_error_rate(gt_text, ocr_text)
-    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
+    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
    gt_words = words(gt_text)
    ocr_words = words(ocr_text)
-    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
+    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align)
    env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
    for report_suffix in ('.html', '.json'):
--- a/qurator/dinglehopper/tests/test_editops.py
+++ b/qurator/dinglehopper/tests/test_editops.py
@@ -1,3 +1,5 @@
 import unicodedata
 from .. import seq_editops, editops
@@ -36,3 +38,11 @@ def test_editops():
    # In these cases, one of the words has a composed form, the other one does not.
    assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
    assert editops('oͤde', 'öde') == [('replace', 0, 0)]
 def test_editops_canonically_equivalent():
    """Check that editops() reports no edits for canonically equivalent text.

    The two inputs differ as raw strings (base letter plus combining tilde
    versus the single precomposed letter) but have the same NFC form.
    """
    decomposed = ''.join(
        unicodedata.lookup(name)
        for name in ('LATIN SMALL LETTER N', 'COMBINING TILDE')
    )
    precomposed = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE')

    # Sanity checks: different code point sequences, identical after NFC.
    assert decomposed != precomposed
    nfc_decomposed = unicodedata.normalize('NFC', decomposed)
    nfc_precomposed = unicodedata.normalize('NFC', precomposed)
    assert nfc_decomposed == nfc_precomposed

    assert editops(decomposed, precomposed) == []