diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 129ce57..5d7f43b 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -60,8 +60,8 @@ def process(gt, ocr, report_prefix): char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align) - gt_words = words(gt_text) - ocr_words = words(ocr_text) + gt_words = words_normalized(gt_text) + ocr_words = words_normalized(ocr_text) word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align) env = Environment(loader=FileSystemLoader(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'templates'))) diff --git a/qurator/dinglehopper/substitute_equivalences.py b/qurator/dinglehopper/substitute_equivalences.py index 524b1ee..faa873e 100644 --- a/qurator/dinglehopper/substitute_equivalences.py +++ b/qurator/dinglehopper/substitute_equivalences.py @@ -36,6 +36,7 @@ def substitute_equivalences(s): '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT } + s = unicodedata.normalize('NFC', s) for fr, to in equivalences.items(): s = s.replace(fr, to) return s