🐛 dinglehopper: Work on NFC'ed grapheme clusters when aligning text

2026-03-15 19:52:00 +01:00 · 2019-09-30 18:17:13 +02:00 · 2019-09-30 18:17:13 +02:00 · 8d055e7b6e
commit 8d055e7b6e
parent 534958be1d
3 changed files with 23 additions and 4 deletions
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -1,6 +1,15 @@
 from .edit_distance import *

-def align(s1, s2):
+
+def align(t1, t2):
+    """Align two texts by the grapheme clusters of their NFC-normalized forms."""
+    s1 = list(grapheme_clusters(unicodedata.normalize('NFC', t1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize('NFC', t2)))
+    return seq_align(s1, s2)
+
+
+def seq_align(s1, s2):
+    """Align general sequences."""
    s1 = list(s1)
    s2 = list(s2)
    ops = seq_editops(s1, s2)
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -7,7 +7,7 @@ from jinja2 import Environment, FileSystemLoader
 from qurator.dinglehopper import *


-def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none):
+def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
    gtx = ''
    ocrx = ''

@@ -58,11 +58,11 @@ def process(gt, ocr, report_prefix):
    wer = word_error_rate(gt_text, ocr_text)
    uwer = unordered_word_error_rate(gt_text, ocr_text)

-    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
+    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)

    gt_words = words(gt_text)
    ocr_words = words(ocr_text)
-    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
+    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align)

    env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
    for report_suffix in ('.html', '.json'):
--- a/qurator/dinglehopper/tests/test_editops.py
+++ b/qurator/dinglehopper/tests/test_editops.py
@@ -1,3 +1,5 @@
+import unicodedata
+
 from .. import seq_editops, editops


@@ -36,3 +38,11 @@ def test_editops():
    # In these cases, one of the words has a composed form, the other one does not.
    assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
    assert editops('oͤde', 'öde') == [('replace', 0, 0)]
+
+
+def test_editops_canonically_equivalent():
+    left = unicodedata.lookup('LATIN SMALL LETTER N') + unicodedata.lookup('COMBINING TILDE')  # decomposed: base letter + combining mark
+    right = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE')  # precomposed: single code point
+    assert left != right  # sanity check: the raw code point sequences differ
+    assert unicodedata.normalize('NFC', left) == unicodedata.normalize('NFC', right)  # ...but they are equal after NFC normalization
+    assert editops(left, right) == []  # so no edit operations are expected between them