diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py
index ede75f4..08bb3f5 100644
--- a/qurator/dinglehopper/align.py
+++ b/qurator/dinglehopper/align.py
@@ -8,6 +8,14 @@ def align(t1, t2):
     return seq_align(s1, s2)
 
 
+def seq_align_linewise(s1, s2, ops):
+    """Align two lists of lines linewise."""
+    assert len(s1) == len(s2)
+    assert len(s2) == len(ops)
+    for l1, l2, line_ops in zip(s1, s2, ops):
+        yield from seq_align(l1, l2, ops=line_ops)
+
+
 def seq_align(s1, s2, ops=None):
     """Align general sequences."""
     s1 = list(s1)
diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
index 46fc0b0..9a2a837 100644
--- a/qurator/dinglehopper/cli.py
+++ b/qurator/dinglehopper/cli.py
@@ -8,7 +8,7 @@ from uniseg.graphemecluster import grapheme_clusters
 from .character_error_rate import character_error_rate_n
 from .flexible_character_accuracy import flexible_character_accuracy, split_matches
 from .word_error_rate import word_error_rate_n, words_normalized
-from .align import seq_align
+from .align import seq_align, seq_align_linewise
 from .extracted_text import ExtractedText
 from .ocr_files import extract
 from .config import Config
@@ -43,7 +43,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
             return "{html_t}".format(html_t=html_t)
 
     ops, ocr_ids = None, None
+    seq_align_fun = seq_align
     if matches:
+        seq_align_fun = seq_align_linewise
         gt_things, ocr_things, ops = split_matches(matches)
         # we have to reconstruct the order of the ocr because we mixed it for fca
         ocr_lines = [match.ocr for match in matches]
@@ -74,7 +76,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
 
     g_pos = 0
     o_pos = 0
-    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, ops=ops)):
+    for k, (g, o) in enumerate(seq_align_fun(gt_things, ocr_things, ops=ops)):
         css_classes = None
         gt_id = None
         ocr_id = None
@@ -85,7 +87,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
                 # support this, i.e. display for the one id produced
                 gt_id = gt_in.segment_id_for_pos(g_pos) if g else None
                 if ocr_ids:
-                    ocr_id = ocr_ids[o_pos]
+                    ocr_id = ocr_ids.get(o_pos, None)
                 else:
                     ocr_id = ocr_in.segment_id_for_pos(o_pos) if o else None
 