mirror of
				https://github.com/qurator-spk/ocrd_repair_inconsistencies.git
				synced 2025-10-31 08:54:13 +01:00 
			
		
		
		
	
						commit
						6ee105b17c
					
				
					 3 changed files with 75 additions and 18 deletions
				
			
		
							
								
								
									
										0
									
								
								ocrd_repair_inconsistencies/__init__.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								ocrd_repair_inconsistencies/__init__.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -1,7 +1,8 @@ | ||||||
| import click | import click | ||||||
| 
 | 
 | ||||||
| from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor | from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor | ||||||
| from ocrd_repair_inconsistencies.ocrd_repair_inconsistencies import RepairInconsistencies | 
 | ||||||
|  | from .ocrd_repair_inconsistencies import RepairInconsistencies | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @click.command() | @click.command() | ||||||
|  |  | ||||||
|  | @ -37,22 +37,51 @@ class RepairInconsistencies(Processor): | ||||||
|             regions = page.get_TextRegion() |             regions = page.get_TextRegion() | ||||||
| 
 | 
 | ||||||
|             for region in regions: |             for region in regions: | ||||||
|                 if region.readingDirection != 'left-to-right': |                 textLineOrder = 'top-to-bottom' | ||||||
|                     LOG.info('Not processing region "%s" (not left-to-right)', region.id) |                 for segment in [region, page]: | ||||||
|  |                     if segment.textLineOrder is None: | ||||||
|                         continue |                         continue | ||||||
|                 if len(region.get_TextLine()) > 1 and region.textLineOrder != 'top-to-bottom': |                     else: | ||||||
|                     LOG.info('Not processing region "%s" (not top-to-bottom)', region.id) |                         textLineOrder = segment.textLineOrder | ||||||
|  |                         break | ||||||
|  |                 if textLineOrder not in ['top-to-bottom', 'bottom-to-top']: | ||||||
|  |                     LOG.info('Not processing page "%s" region "%s" (textLineOrder=%s)', | ||||||
|  |                              page_id, region.id, textLineOrder) | ||||||
|                     continue |                     continue | ||||||
| 
 | 
 | ||||||
|                 _fix_lines(region) |                 _fix_lines(region, page_id, reverse=textLineOrder=='bottom-to-top') | ||||||
| 
 | 
 | ||||||
|                 lines = region.get_TextLine() |                 lines = region.get_TextLine() | ||||||
|                 for line in lines: |                 for line in lines: | ||||||
|                     _fix_words(line) |                     readingDirection = 'left-to-right' | ||||||
|  |                     for segment in [line, region, page]: | ||||||
|  |                         if segment.readingDirection is None: | ||||||
|  |                             continue | ||||||
|  |                         else: | ||||||
|  |                             readingDirection = segment.readingDirection | ||||||
|  |                             break | ||||||
|  |                     if readingDirection not in ['left-to-right', 'right-to-left']: | ||||||
|  |                         LOG.info('Not processing page "%s" line "%s" (readingDirection=%s)', | ||||||
|  |                                  page_id, line.id, readingDirection) | ||||||
|  |                         continue | ||||||
|  |                      | ||||||
|  |                     _fix_words(line, page_id, reverse=readingDirection=='right-to-left') | ||||||
| 
 | 
 | ||||||
|                     words = line.get_Word() |                     words = line.get_Word() | ||||||
|                     for word in words: |                     for word in words: | ||||||
|                         _fix_glyphs(word) |                         readingDirection = 'left-to-right' | ||||||
|  |                         for segment in [word, line, region, page]: | ||||||
|  |                             if segment.readingDirection is None: | ||||||
|  |                                 continue | ||||||
|  |                             else: | ||||||
|  |                                 readingDirection = segment.readingDirection | ||||||
|  |                                 break | ||||||
|  |                         if readingDirection not in ['left-to-right', 'right-to-left']: | ||||||
|  |                             LOG.info('Not processing page "%s" word "%s" (readingDirection=%s)', | ||||||
|  |                                      page_id, word.id, readingDirection) | ||||||
|  |                             continue | ||||||
|  | 
 | ||||||
|  |                         _fix_glyphs(word, page_id, reverse=readingDirection=='right-to-left') | ||||||
| 
 | 
 | ||||||
|             file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) |             file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) | ||||||
|             if file_id == input_file.ID: |             if file_id == input_file.ID: | ||||||
|  | @ -84,46 +113,73 @@ def get_text(thing, joiner=None): | ||||||
|     return text |     return text | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _fix_words(line): | def _fix_words(line, page_id, reverse=False): | ||||||
|     """Fix word order in a line""" |     """Fix word order in a line""" | ||||||
| 
 | 
 | ||||||
|     words = line.get_Word() |     words = line.get_Word() | ||||||
|  |     if not words: | ||||||
|  |         return | ||||||
|     line_text = get_text(line) |     line_text = get_text(line) | ||||||
|     words_text = get_text(words, ' ') |     words_text = get_text(words, ' ') | ||||||
|     if line_text != words_text: |     if line_text != words_text: | ||||||
|         sorted_words = sorted(words, key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x) |         sorted_words = sorted(words, reverse=reverse, | ||||||
|  |                               key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x) | ||||||
|         sorted_words_text = get_text(sorted_words, ' ') |         sorted_words_text = get_text(sorted_words, ' ') | ||||||
| 
 | 
 | ||||||
|         if sorted_words_text == line_text: |         if sorted_words_text == line_text: | ||||||
|             LOG.info('Fixing word order of line "%s"', line.id) |             LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id) | ||||||
|             line.set_Word(sorted_words) |             line.set_Word(sorted_words) | ||||||
|  |         else: | ||||||
|  |             LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"', | ||||||
|  |                       page_id, line.id, | ||||||
|  |                       str([word.id for word in words]), | ||||||
|  |                       str([word.id for word in sorted_words]), | ||||||
|  |                       words_text, line_text) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _fix_glyphs(word): | def _fix_glyphs(word, page_id, reverse=False): | ||||||
|     """Fix glyph order in a word""" |     """Fix glyph order in a word""" | ||||||
| 
 | 
 | ||||||
|     glyphs = word.get_Glyph() |     glyphs = word.get_Glyph() | ||||||
|  |     if not glyphs: | ||||||
|  |         return | ||||||
|     word_text = get_text(word) |     word_text = get_text(word) | ||||||
|     glyphs_text = get_text(glyphs, '') |     glyphs_text = get_text(glyphs, '') | ||||||
|     if word_text != glyphs_text: |     if word_text != glyphs_text: | ||||||
|         sorted_glyphs = sorted(glyphs, key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x) |         sorted_glyphs = sorted(glyphs, reverse=reverse, | ||||||
|  |                                key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x) | ||||||
|         sorted_glyphs_text = get_text(sorted_glyphs, '') |         sorted_glyphs_text = get_text(sorted_glyphs, '') | ||||||
| 
 | 
 | ||||||
|         if sorted_glyphs_text == word_text: |         if sorted_glyphs_text == word_text: | ||||||
|             LOG.info('Fixing glyph order of word "%s"', word.id) |             LOG.info('Fixing glyph order of page "%s" word "%s"', page_id, word.id) | ||||||
|             word.set_Glyph(sorted_glyphs) |             word.set_Glyph(sorted_glyphs) | ||||||
|  |         else: | ||||||
|  |             LOG.debug('Resorting glyphs of page "%s" word "%s" from %s to %s does not suffice to turn "%s" into "%s"', | ||||||
|  |                       page_id, word.id, | ||||||
|  |                       str([glyph.id for glyph in glyphs]), | ||||||
|  |                       str([glyph.id for glyph in sorted_glyphs]), | ||||||
|  |                       glyphs_text, word_text) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _fix_lines(region): | def _fix_lines(region, page_id, reverse=False): | ||||||
|     """Fix line order in a region""" |     """Fix line order in a region""" | ||||||
| 
 | 
 | ||||||
|     lines = region.get_TextLine() |     lines = region.get_TextLine() | ||||||
|  |     if not lines: | ||||||
|  |         return | ||||||
|     region_text = get_text(region) |     region_text = get_text(region) | ||||||
|     lines_text = get_text(lines, '\n') |     lines_text = get_text(lines, '\n') | ||||||
|     if region_text != lines_text: |     if region_text != lines_text: | ||||||
|         sorted_lines = sorted(lines, key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y) |         sorted_lines = sorted(lines, reverse=reverse, | ||||||
|  |                               key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y) | ||||||
|         sorted_lines_text = get_text(sorted_lines, '\n') |         sorted_lines_text = get_text(sorted_lines, '\n') | ||||||
| 
 | 
 | ||||||
|         if sorted_lines_text == region_text: |         if sorted_lines_text == region_text: | ||||||
|             LOG.info('Fixing line order of region "%s"', region.id) |             LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id) | ||||||
|             region.set_TextLine(sorted_lines) |             region.set_TextLine(sorted_lines) | ||||||
|  |         else: | ||||||
|  |             LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"', | ||||||
|  |                       page_id, region.id, | ||||||
|  |                       str([line.id for line in lines]), | ||||||
|  |                       str([line.id for line in sorted_lines]), | ||||||
|  |                       lines_text, region_text) | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue