mirror of
				https://github.com/qurator-spk/dinglehopper.git
				synced 2025-10-31 17:34:15 +01:00 
			
		
		
		
	Correct report for fca
As the fca implementation already knows the editing operations for each segment, we use a different (linewise) sequence alignment method.
This commit is contained in:
		
							parent
							
								
									750ad00d1b
								
							
						
					
					
						commit
						1bc7ef6c8b
					
				
					 2 changed files with 13 additions and 3 deletions
				
			
		|  | @ -8,6 +8,14 @@ def align(t1, t2): | |||
|     return seq_align(s1, s2) | ||||
| 
 | ||||
| 
 | ||||
| def seq_align_linewise(s1, s2, ops): | ||||
|     """Align two lists of lines linewise.""" | ||||
|     assert len(s1) == len(s2) | ||||
|     assert len(s2) == len(ops) | ||||
|     for l1, l2, line_ops in zip(s1, s2, ops): | ||||
|         yield from seq_align(l1, l2, ops=line_ops) | ||||
| 
 | ||||
| 
 | ||||
| def seq_align(s1, s2, ops=None): | ||||
|     """Align general sequences.""" | ||||
|     s1 = list(s1) | ||||
|  |  | |||
|  | @ -8,7 +8,7 @@ from uniseg.graphemecluster import grapheme_clusters | |||
| from .character_error_rate import character_error_rate_n | ||||
| from .flexible_character_accuracy import flexible_character_accuracy, split_matches | ||||
| from .word_error_rate import word_error_rate_n, words_normalized | ||||
| from .align import seq_align | ||||
| from .align import seq_align, seq_align_linewise | ||||
| from .extracted_text import ExtractedText | ||||
| from .ocr_files import extract | ||||
| from .config import Config | ||||
|  | @ -43,7 +43,9 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None): | |||
|             return "{html_t}".format(html_t=html_t) | ||||
| 
 | ||||
|     ops, ocr_ids = None, None | ||||
|     seq_align_fun = seq_align | ||||
|     if matches: | ||||
|         seq_align_fun = seq_align_linewise | ||||
|         gt_things, ocr_things, ops = split_matches(matches) | ||||
|         # we have to reconstruct the order of the ocr because we mixed it for fca | ||||
|         ocr_lines = [match.ocr for match in matches] | ||||
|  | @ -74,7 +76,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None): | |||
| 
 | ||||
|     g_pos = 0 | ||||
|     o_pos = 0 | ||||
|     for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, ops=ops)): | ||||
|     for k, (g, o) in enumerate(seq_align_fun(gt_things, ocr_things, ops=ops)): | ||||
|         css_classes = None | ||||
|         gt_id = None | ||||
|         ocr_id = None | ||||
|  | @ -85,7 +87,7 @@ def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None): | |||
|                 # support this, i.e. display for the one id produced | ||||
|                 gt_id = gt_in.segment_id_for_pos(g_pos) if g else None | ||||
|                 if ocr_ids: | ||||
|                     ocr_id = ocr_ids[o_pos] | ||||
|                     ocr_id = ocr_ids.get(o_pos, None) | ||||
|                 else: | ||||
|                     ocr_id = ocr_in.segment_id_for_pos(o_pos) if o else None | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue