mirror of
				https://github.com/qurator-spk/page2tsv.git
				synced 2025-10-26 06:04:14 +01:00 
			
		
		
		
	try to infer correct line ordering ...
This commit is contained in:
		
							parent
							
								
									e535a070c4
								
							
						
					
					
						commit
						7bf9cfa5de
					
				
					 1 changed file with 38 additions and 17 deletions
				
			
		
							
								
								
									
										37
									
								
								cli.py
									
										
									
									
									
								
							
							
						
						
									
										37
									
								
								cli.py
									
										
									
									
									
								
							|  | @ -152,6 +152,8 @@ def ner(tsv, ner_rest_endpoint): | ||||||
| @click.option('--scale-factor', type=float, default=0.5685, help='default: 0.5685') | @click.option('--scale-factor', type=float, default=0.5685, help='default: 0.5685') | ||||||
| def page2tsv(page_xml_file, tsv_out_file, image_url, ner_rest_endpoint, noproxy, scale_factor): | def page2tsv(page_xml_file, tsv_out_file, image_url, ner_rest_endpoint, noproxy, scale_factor): | ||||||
| 
 | 
 | ||||||
|  |     out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID', 'url_id', 'left', 'right', 'top', 'bottom'] | ||||||
|  | 
 | ||||||
|     if noproxy: |     if noproxy: | ||||||
|         os.environ['no_proxy'] = '*' |         os.environ['no_proxy'] = '*' | ||||||
| 
 | 
 | ||||||
|  | @ -164,17 +166,24 @@ def page2tsv(page_xml_file, tsv_out_file, image_url, ner_rest_endpoint, noproxy, | ||||||
| 
 | 
 | ||||||
|         urls = [part['url'] for part in parts] |         urls = [part['url'] for part in parts] | ||||||
|     else: |     else: | ||||||
|         pd.DataFrame([], columns=['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID', 'url_id', 'left', 'right', 'top', |         pd.DataFrame([], columns=out_columns). to_csv(tsv_out_file, sep="\t", quoting=3, index=False) | ||||||
|                                   'bottom']). to_csv(tsv_out_file, sep="\t", quoting=3, index=False) |  | ||||||
| 
 | 
 | ||||||
|     tsv = [] |     tsv = [] | ||||||
|     for words in tree.findall('.//{%s}Word' % xmlns): |     line_number = 0 | ||||||
|  |     rgn_number = 0 | ||||||
|  |     for region in tree.findall('.//{%s}TextRegion' % xmlns): | ||||||
|  |         rgn_number += 1 | ||||||
|  |         for text_line in region.findall('.//{%s}TextLine' % xmlns): | ||||||
|  |             line_number += 1 | ||||||
|  |             for words in text_line.findall('.//{%s}Word' % xmlns): | ||||||
|                 for word in words.findall('.//{%s}Unicode' % xmlns): |                 for word in words.findall('.//{%s}Unicode' % xmlns): | ||||||
|                     text = word.text |                     text = word.text | ||||||
|                     for coords in words.findall('.//{%s}Coords' % xmlns): |                     for coords in words.findall('.//{%s}Coords' % xmlns): | ||||||
| 
 | 
 | ||||||
|                 # transform OCR coordinates using `scale_factor` to derive correct coordinates for the web presentation image |                         # transform OCR coordinates using `scale_factor` to derive | ||||||
|                 points = [int(scale_factor * float(pos)) for p in coords.attrib['points'].split(' ') for pos in p.split(',')] |                         # correct coordinates for the web presentation image | ||||||
|  |                         points = [int(scale_factor * float(pos)) | ||||||
|  |                                   for p in coords.attrib['points'].split(' ') for pos in p.split(',')] | ||||||
| 
 | 
 | ||||||
|                         x_points = [points[i] for i in range(0, len(points), 2)] |                         x_points = [points[i] for i in range(0, len(points), 2)] | ||||||
|                         y_points = [points[i] for i in range(1, len(points), 2)] |                         y_points = [points[i] for i in range(1, len(points), 2)] | ||||||
|  | @ -184,14 +193,26 @@ def page2tsv(page_xml_file, tsv_out_file, image_url, ner_rest_endpoint, noproxy, | ||||||
|                         top = min(y_points) |                         top = min(y_points) | ||||||
|                         bottom = max(y_points) |                         bottom = max(y_points) | ||||||
| 
 | 
 | ||||||
|                 tsv.append((0, text, 'O', 'O', '-', len(urls), left, right, top, bottom)) |                         tsv.append((rgn_number, line_number, left + (right-left)/2.0, | ||||||
|  |                                     0, text, 'O', 'O', '-', len(urls), left, right, top, bottom)) | ||||||
| 
 | 
 | ||||||
|     with open(tsv_out_file, 'a') as f: |     with open(tsv_out_file, 'a') as f: | ||||||
| 
 | 
 | ||||||
|         f.write('# ' + image_url + '\n') |         f.write('# ' + image_url + '\n') | ||||||
| 
 | 
 | ||||||
|     tsv = pd.DataFrame(tsv, columns=['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'GND-ID', |     tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter'] + out_columns) | ||||||
|                                      'url_id', 'left', 'right', 'top', 'bottom']) | 
 | ||||||
|  |     vlinecenter = pd.DataFrame(tsv[['line', 'top']].groupby('line', sort=False).mean().top + | ||||||
|  |                                (tsv[['line', 'bottom']].groupby('line').mean().bottom - | ||||||
|  |                                 tsv[['line', 'top']].groupby('line').mean().top) / 2, columns=['vlinecenter']) | ||||||
|  | 
 | ||||||
|  |     tsv = tsv.merge(vlinecenter, left_on='line', right_index=True) | ||||||
|  | 
 | ||||||
|  |     regions = [region.sort_values(['vlinecenter', 'hcenter']) for rid, region in tsv.groupby('rid', sort=False)] | ||||||
|  | 
 | ||||||
|  |     tsv = pd.concat(regions) | ||||||
|  | 
 | ||||||
|  |     tsv = tsv[out_columns].reset_index(drop=True) | ||||||
| 
 | 
 | ||||||
|     if ner_rest_endpoint is not None: |     if ner_rest_endpoint is not None: | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue