mirror of
				https://github.com/qurator-spk/page2tsv.git
				synced 2025-10-26 06:04:14 +01:00 
			
		
		
		
	fix repeated text rows
This commit is contained in:
		
							parent
							
								
									a6008b83b5
								
							
						
					
					
						commit
						de575037e6
					
				
					 1 changed file with 7 additions and 5 deletions
				
			
		|  | @ -125,11 +125,13 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, | |||
|                 for text_equiv in word.findall('./{%s}TextEquiv/{%s}Unicode' % (xmlns, xmlns)): | ||||
|                     text = text_equiv.text | ||||
| 
 | ||||
|                     points = [] | ||||
| 
 | ||||
|                     for coords in word.findall('./{%s}Coords' % xmlns): | ||||
| 
 | ||||
|                         # transform OCR coordinates using `scale_factor` to derive | ||||
|                         # correct coordinates for the web presentation image | ||||
|                         points = [int(scale_factor * float(pos)) | ||||
|                         points += [int(scale_factor * float(pos)) | ||||
|                                   for p in coords.attrib['points'].split(' ') for pos in p.split(',')] | ||||
| 
 | ||||
|                     x_points, y_points = points[0::2], points[1::2] | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue