From 8d055e7b6e55e8c1da9ac9db2f5805804c59ab44 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike"
Date: Mon, 30 Sep 2019 18:17:13 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20dinglehopper:=20Work=20on=20NFC'?=
 =?UTF-8?q?ed=20grapheme=20clusters=20when=20aligning=20text?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/dinglehopper/align.py              | 11 ++++++++++-
 qurator/dinglehopper/cli.py                |  6 +++---
 qurator/dinglehopper/tests/test_editops.py | 10 ++++++++++
 3 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py
index 043db33..ab44760 100644
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -1,6 +1,15 @@
 from .edit_distance import *
 
-def align(s1, s2):
+
+def align(t1, t2):
+    """Align text."""
+    s1 = list(grapheme_clusters(unicodedata.normalize('NFC', t1)))
+    s2 = list(grapheme_clusters(unicodedata.normalize('NFC', t2)))
+    return seq_align(s1, s2)
+
+
+def seq_align(s1, s2):
+    """Align general sequences."""
     s1 = list(s1)
     s2 = list(s2)
     ops = seq_editops(s1, s2)
diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 3cdcf9a..129ce57 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -7,7 +7,7 @@ from jinja2 import Environment, FileSystemLoader
 from qurator.dinglehopper import *
 
 
-def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none):
+def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align):
     gtx = ''
     ocrx = ''
 
@@ -58,11 +58,11 @@ def process(gt, ocr, report_prefix):
     wer = word_error_rate(gt_text, ocr_text)
     uwer = unordered_word_error_rate(gt_text, ocr_text)
 
-    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·')
+    char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align)
 
     gt_words = words(gt_text)
     ocr_words = words(ocr_text)
-    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯')
+    word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align)
 
     env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates')))
     for report_suffix in ('.html', '.json'):
diff --git a/qurator/dinglehopper/tests/test_editops.py b/qurator/dinglehopper/tests/test_editops.py
index ce22377..8fafe5d 100644
--- a/qurator/dinglehopper/tests/test_editops.py
+++ b/qurator/dinglehopper/tests/test_editops.py
@@ -1,3 +1,5 @@
+import unicodedata
+
 from .. import seq_editops, editops
 
 
@@ -36,3 +38,11 @@ def test_editops():
     # In these cases, one of the words has a composed form, the other one does not.
     assert editops('Schlyñ', 'Schlym̃') == [('replace', 5, 5)]
     assert editops('oͤde', 'öde') == [('replace', 0, 0)]
+
+
+def test_editops_canonically_equivalent():
+    left = unicodedata.lookup('LATIN SMALL LETTER N') + unicodedata.lookup('COMBINING TILDE')
+    right = unicodedata.lookup('LATIN SMALL LETTER N WITH TILDE')
+    assert left != right
+    assert unicodedata.normalize('NFC', left) == unicodedata.normalize('NFC', right)
+    assert editops(left, right) == []