mirror of
				https://github.com/qurator-spk/page2tsv.git
				synced 2025-10-31 16:44:13 +01:00 
			
		
		
		
	Store OCR or NED confidences in the TSV file
This commit is contained in:
		
							parent
							
								
									5d55ba24a3
								
							
						
					
					
						commit
						900015da61
					
				
					 2 changed files with 13 additions and 4 deletions
				
			
		|  | @ -75,9 +75,9 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, | ||||||
|              ned_threshold, min_confidence, max_confidence): |              ned_threshold, min_confidence, max_confidence): | ||||||
| 
 | 
 | ||||||
|     if purpose == "NERD": |     if purpose == "NERD": | ||||||
|         out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom'] |         out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf'] | ||||||
|     elif purpose == "OCR": |     elif purpose == "OCR": | ||||||
|         out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom'] |         out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf'] | ||||||
| 
 | 
 | ||||||
|         if min_confidence is not None and max_confidence is not None: |         if min_confidence is not None and max_confidence is not None: | ||||||
|             out_columns += ['ocrconf'] |             out_columns += ['ocrconf'] | ||||||
|  | @ -168,6 +168,7 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, | ||||||
|         tsv['NE-TAG'] = 'O' |         tsv['NE-TAG'] = 'O' | ||||||
|         tsv['NE-EMB'] = 'O' |         tsv['NE-EMB'] = 'O' | ||||||
|         tsv['ID'] = '-' |         tsv['ID'] = '-' | ||||||
|  |         tsv['conf'] = '-' | ||||||
| 
 | 
 | ||||||
|         tsv = tsv.rename(columns={'TEXT': 'TOKEN'}) |         tsv = tsv.rename(columns={'TEXT': 'TOKEN'}) | ||||||
|     elif purpose == 'OCR': |     elif purpose == 'OCR': | ||||||
|  |  | ||||||
|  | @ -32,6 +32,7 @@ def ned(tsv, ner_result, ned_rest_endpoint, json_file=None, threshold=None): | ||||||
|     entity = "" |     entity = "" | ||||||
|     entity_type = None |     entity_type = None | ||||||
|     tsv['ID'] = '-' |     tsv['ID'] = '-' | ||||||
|  |     tsv['conf'] = '-' | ||||||
| 
 | 
 | ||||||
|     def check_entity(tag): |     def check_entity(tag): | ||||||
|         nonlocal entity, entity_type, rids |         nonlocal entity, entity_type, rids | ||||||
|  | @ -44,19 +45,26 @@ def ned(tsv, ner_result, ned_rest_endpoint, json_file=None, threshold=None): | ||||||
|                 if 'ranking' in ned_result[eid]: |                 if 'ranking' in ned_result[eid]: | ||||||
|                     ranking = ned_result[eid]['ranking'] |                     ranking = ned_result[eid]['ranking'] | ||||||
| 
 | 
 | ||||||
|                     #tsv.loc[rids, 'ID'] = ranking[0][1]['wikidata'] if threshold is None or ranking[0][1]['proba_1'] >= threshold else '' |                     # tsv.loc[rids, 'ID'] = ranking[0][1]['wikidata'] | ||||||
|  |                     # if threshold is None or ranking[0][1]['proba_1'] >= threshold else '' | ||||||
| 
 | 
 | ||||||
|                     tmp = "|".join([ranking[i][1]['wikidata'] |                     tmp = "|".join([ranking[i][1]['wikidata'] | ||||||
|                                     for i in range(len(ranking)) |                                     for i in range(len(ranking)) | ||||||
|                                     if threshold is None or ranking[i][1]['proba_1'] >= threshold]) |                                     if threshold is None or ranking[i][1]['proba_1'] >= threshold]) | ||||||
|                     tsv.loc[rids, 'ID'] = tmp if len(tmp) > 0 else '-' |                     tsv.loc[rids, 'ID'] = tmp if len(tmp) > 0 else '-' | ||||||
| 
 | 
 | ||||||
|  |                     tmp = ",".join([str(ranking[i][1]['proba_1']) | ||||||
|  |                                     for i in range(len(ranking)) | ||||||
|  |                                     if threshold is None or ranking[i][1]['proba_1'] >= threshold]) | ||||||
|  | 
 | ||||||
|  |                     tsv.loc[rids, 'conf'] = tmp if len(tmp) > 0 else '-' | ||||||
|  | 
 | ||||||
|             rids = [] |             rids = [] | ||||||
|             entity = "" |             entity = "" | ||||||
|             entity_type = None |             entity_type = None | ||||||
| 
 | 
 | ||||||
|     ner_tmp = tsv.copy() |     ner_tmp = tsv.copy() | ||||||
|     ner_tmp.loc[~ner_tmp['NE-TAG'].isin(['O', 'B-PER', 'B-LOC','B-ORG', 'I-PER', 'I-LOC', 'I-ORG']), 'NE-TAG'] = 'O' |     ner_tmp.loc[~ner_tmp['NE-TAG'].isin(['O', 'B-PER', 'B-LOC', 'B-ORG', 'I-PER', 'I-LOC', 'I-ORG']), 'NE-TAG'] = 'O' | ||||||
| 
 | 
 | ||||||
|     for rid, row in ner_tmp.iterrows(): |     for rid, row in ner_tmp.iterrows(): | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue