mirror of
				https://github.com/qurator-spk/ocrd_repair_inconsistencies.git
				synced 2025-10-31 00:44:13 +01:00 
			
		
		
		
	relax concatenation test: text must be equal irrespective of tokenization/joiner
This commit is contained in:
		
							parent
							
								
									6ee105b17c
								
							
						
					
					
						commit
						f829015bb5
					
				
					 1 changed file with 5 additions and 3 deletions
				
			
		|  | @ -126,11 +126,12 @@ def _fix_words(line, page_id, reverse=False): | |||
|                               key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x) | ||||
|         sorted_words_text = get_text(sorted_words, ' ') | ||||
| 
 | ||||
|         if sorted_words_text == line_text: | ||||
|         if (sorted_words_text == line_text or | ||||
|             sorted_words_text.replace(' ', '') == line_text.replace(' ', '')): | ||||
|             LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id) | ||||
|             line.set_Word(sorted_words) | ||||
|         else: | ||||
|             LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"', | ||||
|             LOG.debug('Resorting lines of page "%s" line "%s" from %s to %s does not suffice to turn "%s" into "%s"', | ||||
|                       page_id, line.id, | ||||
|                       str([word.id for word in words]), | ||||
|                       str([word.id for word in sorted_words]), | ||||
|  | @ -174,7 +175,8 @@ def _fix_lines(region, page_id, reverse=False): | |||
|                               key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y) | ||||
|         sorted_lines_text = get_text(sorted_lines, '\n') | ||||
| 
 | ||||
|         if sorted_lines_text == region_text: | ||||
|         if (sorted_lines_text == region_text or | ||||
|             sorted_lines_text.replace('\n', '') == region_text.replace('\n', '')): | ||||
|             LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id) | ||||
|             region.set_TextLine(sorted_lines) | ||||
|         else: | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue