mirror of
				https://github.com/qurator-spk/dinglehopper.git
				synced 2025-10-31 01:14:16 +01:00 
			
		
		
		
	🚧 dinglehopper: Display segment id when hovering over a character difference
This commit is contained in:
		
							parent
							
								
									bc1002b1e6
								
							
						
					
					
						commit
						a448133394
					
				
					 7 changed files with 97 additions and 21 deletions
				
			
		|  | @ -15,6 +15,10 @@ def character_error_rate_n(reference, compared) -> Tuple[float, int]: | ||||||
|     :return: character error rate and length of the reference |     :return: character error rate and length of the reference | ||||||
|     """ |     """ | ||||||
|     d = distance(reference, compared) |     d = distance(reference, compared) | ||||||
|  |     # XXX | ||||||
|  |     from .cli import ExtractedText | ||||||
|  |     if isinstance(reference, ExtractedText): | ||||||
|  |         reference = reference.text | ||||||
|     n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference)))) |     n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference)))) | ||||||
| 
 | 
 | ||||||
|     if d == 0: |     if d == 0: | ||||||
|  |  | ||||||
|  | @ -8,11 +8,11 @@ from markupsafe import escape | ||||||
| from qurator.dinglehopper import * | from qurator.dinglehopper import * | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align): | def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): | ||||||
|     gtx = '' |     gtx = '' | ||||||
|     ocrx = '' |     ocrx = '' | ||||||
| 
 | 
 | ||||||
|     def format_thing(t, css_classes=None): |     def format_thing(t, css_classes=None, id_=None): | ||||||
|         if t is None: |         if t is None: | ||||||
|             html_t = none |             html_t = none | ||||||
|             css_classes += ' ellipsis' |             css_classes += ' ellipsis' | ||||||
|  | @ -21,19 +21,52 @@ def gen_diff_report(gt_things, ocr_things, css_prefix, joiner, none, align): | ||||||
|         else: |         else: | ||||||
|             html_t = escape(t) |             html_t = escape(t) | ||||||
| 
 | 
 | ||||||
|  |         html_custom_attrs = "" | ||||||
|  |         # XXX must sanitize id_ or do we trust the XML? | ||||||
|  |         if id_: | ||||||
|  |             html_custom_attrs = 'data-segment-id="{}"'.format(id_) | ||||||
|  | 
 | ||||||
|         if css_classes: |         if css_classes: | ||||||
|             return '<span class="{css_classes}">{html_t}</span>'.format(css_classes=css_classes, html_t=html_t) |             return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(css_classes=css_classes, html_t=html_t, html_custom_attrs=html_custom_attrs) | ||||||
|         else: |         else: | ||||||
|             return '{html_t}'.format(html_t=html_t) |             return '{html_t}'.format(html_t=html_t) | ||||||
| 
 | 
 | ||||||
|     for k, (g, o) in enumerate(align(gt_things, ocr_things)): |     if isinstance(gt_in, ExtractedText): | ||||||
|         if g == o: |         print(gt_in.text) | ||||||
|             css_classes = None |         if not isinstance(ocr_in, ExtractedText): | ||||||
|  |             raise TypeError() | ||||||
|  |         # XXX splitting should be done in ExtractedText | ||||||
|  |         gt_things = list(grapheme_clusters(gt_in.text)) | ||||||
|  |         ocr_things = list(grapheme_clusters(ocr_in.text)) | ||||||
|     else: |     else: | ||||||
|             css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k) |         gt_things = gt_in | ||||||
|  |         ocr_things = ocr_in | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     g_pos = 0 | ||||||
|  |     o_pos = 0 | ||||||
|  |     for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)): | ||||||
|  |         css_classes = None | ||||||
|  |         gt_id = None | ||||||
|  |         ocr_id = None | ||||||
|  |         if g != o: | ||||||
|  |             css_classes = '{css_prefix}diff{k} diff'.format(css_prefix=css_prefix, k=k) | ||||||
|  |             if isinstance(gt_in, ExtractedText): | ||||||
|  |                 gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None | ||||||
|  |                 ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None | ||||||
|  |                 # XXX note that deletions and inserts only produce one id + None, UI must | ||||||
|  |                 #     support this, i.e. display for the one id produced | ||||||
|  |                 # XXX otherwise, it should always display for BOTH ids | ||||||
|  | 
 | ||||||
|  |         gtx += joiner + format_thing(g, css_classes, gt_id) | ||||||
|  |         ocrx += joiner + format_thing(o, css_classes, ocr_id) | ||||||
|  | 
 | ||||||
|  |         if g is not None: | ||||||
|  |             g_pos += len(g) | ||||||
|  |         if o is not None: | ||||||
|  |             o_pos += len(o) | ||||||
| 
 | 
 | ||||||
|         gtx += joiner + format_thing(g, css_classes) |  | ||||||
|         ocrx += joiner + format_thing(o, css_classes) |  | ||||||
| 
 | 
 | ||||||
|     return \ |     return \ | ||||||
|         ''' |         ''' | ||||||
|  | @ -51,20 +84,21 @@ def process(gt, ocr, report_prefix, *, metrics=True): | ||||||
|     Click on a wrapper. |     Click on a wrapper. | ||||||
|     """ |     """ | ||||||
| 
 | 
 | ||||||
|     gt_text = text(gt) |     gt_text = extract(gt) | ||||||
|     ocr_text = text(ocr) |     ocr_text = extract(ocr) | ||||||
| 
 | 
 | ||||||
|     gt_text = substitute_equivalences(gt_text) |     # FIXME | ||||||
|     ocr_text = substitute_equivalences(ocr_text) |     #gt_text = substitute_equivalences(gt_text) | ||||||
|  |     #ocr_text = substitute_equivalences(ocr_text) | ||||||
| 
 | 
 | ||||||
|     cer, n_characters = character_error_rate_n(gt_text, ocr_text) |     cer, n_characters = character_error_rate_n(gt_text, ocr_text) | ||||||
|     wer, n_words = word_error_rate_n(gt_text, ocr_text) |     wer, n_words = word_error_rate_n(gt_text, ocr_text) | ||||||
| 
 | 
 | ||||||
|     char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·', align=align) |     char_diff_report = gen_diff_report(gt_text, ocr_text, css_prefix='c', joiner='', none='·') | ||||||
| 
 | 
 | ||||||
|     gt_words = words_normalized(gt_text) |     gt_words = words_normalized(gt_text) | ||||||
|     ocr_words = words_normalized(ocr_text) |     ocr_words = words_normalized(ocr_text) | ||||||
|     word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯', align=seq_align) |     word_diff_report = gen_diff_report(gt_words, ocr_words, css_prefix='w', joiner=' ', none='⋯') | ||||||
| 
 | 
 | ||||||
|     def json_float(value): |     def json_float(value): | ||||||
|         """Convert a float value to a JSON float. |         """Convert a float value to a JSON float. | ||||||
|  |  | ||||||
|  | @ -8,6 +8,7 @@ import numpy as np | ||||||
| from uniseg.graphemecluster import grapheme_clusters | from uniseg.graphemecluster import grapheme_clusters | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def levenshtein_matrix(seq1: Sequence, seq2: Sequence): | def levenshtein_matrix(seq1: Sequence, seq2: Sequence): | ||||||
|     """Compute the matrix commonly computed to produce the Levenshtein distance. |     """Compute the matrix commonly computed to produce the Levenshtein distance. | ||||||
|     This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired |     This is also known as the Wagner-Fischer algorithm. The matrix element at the bottom right contains the desired | ||||||
|  | @ -75,6 +76,12 @@ def distance(s1, s2): | ||||||
|     Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme |     Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme | ||||||
|     clusters. This should be the correct way to compare two Unicode strings. |     clusters. This should be the correct way to compare two Unicode strings. | ||||||
|     """ |     """ | ||||||
|  |     # XXX | ||||||
|  |     from .cli import ExtractedText | ||||||
|  |     if isinstance(s1, ExtractedText): | ||||||
|  |         s1 = s1.text | ||||||
|  |     if isinstance(s2, ExtractedText): | ||||||
|  |         s2 = s2.text | ||||||
|     s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) |     s1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1))) | ||||||
|     s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) |     s2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2))) | ||||||
|     return levenshtein(s1, s2) |     return levenshtein(s1, s2) | ||||||
|  |  | ||||||
|  | @ -138,7 +138,7 @@ def page_extract(tree): | ||||||
| 
 | 
 | ||||||
|     # XXX Does a file have to have regions etc.? region vs lines etc. |     # XXX Does a file have to have regions etc.? region vs lines etc. | ||||||
|     # Filter empty region texts |     # Filter empty region texts | ||||||
|     regions = (r for r in regions if r.text is not None) |     regions = [r for r in regions if r.text is not None] | ||||||
| 
 | 
 | ||||||
|     return ExtractedText(regions, '\n') |     return ExtractedText(regions, '\n') | ||||||
|     # FIXME needs to handle normalization |     # FIXME needs to handle normalization | ||||||
|  |  | ||||||
|  | @ -26,12 +26,23 @@ | ||||||
|       border: 2px solid; |       border: 2px solid; | ||||||
|       border-radius: 5px; |       border-radius: 5px; | ||||||
|     } |     } | ||||||
|  |    #status-box { | ||||||
|  |       position: fixed; | ||||||
|  |       background: grey; | ||||||
|  |       color: white; | ||||||
|  |       width: 100%; | ||||||
|  |       height: 2em; | ||||||
|  |     } | ||||||
|     </style> |     </style> | ||||||
| </head> | </head> | ||||||
| <body> | <body> | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | <div id="status-box"> foo</div> | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| <div class="container"> | <div class="container"> | ||||||
| 
 | 
 | ||||||
| {{ gt }}<br> | {{ gt }}<br> | ||||||
|  |  | ||||||
|  | @ -4,11 +4,16 @@ function find_diff_class(classes) { | ||||||
| 
 | 
 | ||||||
| $(document).ready(function() { | $(document).ready(function() { | ||||||
|     $('.diff').mouseover(function() { |     $('.diff').mouseover(function() { | ||||||
|         let c = find_diff_class($(this).attr('class')) |         let c = find_diff_class($(this).attr('class')); | ||||||
|         $('.' + c).addClass('diff-highlight') |         $('.' + c).addClass('diff-highlight'); | ||||||
|  | 
 | ||||||
|  |         segment_id = $(this).attr('data-segment-id'); | ||||||
|  |         $('#status-box').text(segment_id); | ||||||
|     }); |     }); | ||||||
|     $('.diff').mouseout(function() { |     $('.diff').mouseout(function() { | ||||||
|         let c = find_diff_class($(this).attr('class')) |         let c = find_diff_class($(this).attr('class')); | ||||||
|         $('.' + c).removeClass('diff-highlight') |         $('.' + c).removeClass('diff-highlight'); | ||||||
|  | 
 | ||||||
|  |         $('#status-box').text(''); | ||||||
|     }); |     }); | ||||||
| }); | }); | ||||||
|  |  | ||||||
|  | @ -32,6 +32,11 @@ def words(s): | ||||||
|         cat = subcat[0] |         cat = subcat[0] | ||||||
|         return cat in unwanted_categories or subcat in unwanted_subcategories |         return cat in unwanted_categories or subcat in unwanted_subcategories | ||||||
| 
 | 
 | ||||||
|  |     # XXX | ||||||
|  |     from .cli import ExtractedText | ||||||
|  |     if isinstance(s, ExtractedText): | ||||||
|  |         s = s.text | ||||||
|  | 
 | ||||||
|     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using |     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using | ||||||
|     # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters." |     # uniseg.wordbreak.words() and ignore all "words" that contain only whitespace, punctuation "or similar characters." | ||||||
|     for word in uniseg.wordbreak.words(s): |     for word in uniseg.wordbreak.words(s): | ||||||
|  | @ -42,10 +47,20 @@ def words(s): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def words_normalized(s): | def words_normalized(s): | ||||||
|  |     # XXX | ||||||
|  |     from .cli import ExtractedText | ||||||
|  |     if isinstance(s, ExtractedText): | ||||||
|  |         s = s.text | ||||||
|     return words(unicodedata.normalize('NFC', s)) |     return words(unicodedata.normalize('NFC', s)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def word_error_rate_n(reference, compared) -> Tuple[float, int]: | def word_error_rate_n(reference, compared) -> Tuple[float, int]: | ||||||
|  |     # XXX | ||||||
|  |     from .cli import ExtractedText | ||||||
|  |     if isinstance(reference, ExtractedText): | ||||||
|  |         reference = reference.text | ||||||
|  |     if isinstance(compared, ExtractedText): | ||||||
|  |         compared = compared.text | ||||||
|     if isinstance(reference, str): |     if isinstance(reference, str): | ||||||
|         reference_seq = list(words_normalized(reference)) |         reference_seq = list(words_normalized(reference)) | ||||||
|         compared_seq = list(words_normalized(compared)) |         compared_seq = list(words_normalized(compared)) | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue