mirror of
				https://github.com/mikegerber/ocrd_calamari.git
				synced 2025-10-31 15:54:13 +01:00 
			
		
		
		
	✨ Do word segmentation as expected by OCR-D PAGE specs
This commit is contained in:
		
							parent
							
								
									0f9c94e7dc
								
							
						
					
					
						commit
						6f4736f8e4
					
				
					 3 changed files with 28 additions and 25 deletions
				
			
		
							
								
								
									
										12
									
								
								README.md
									
										
									
									
									
								
							
							
						
						
									
										12
									
								
								README.md
									
										
									
									
									
								
							|  | @ -14,12 +14,12 @@ This processor only operates on the text line level and so needs a line segmenta | |||
| image) as its input. | ||||
| 
 | ||||
| In addition to the line text it also outputs glyph segmentation including | ||||
| per-glyph confidence values and per-glyph alternative predictions as provided | ||||
| by the Calamari OCR engine. Note that while Calamari does not provide word | ||||
| segmentation, this processor produces word segmentation inferred from Unicode | ||||
| text segmentation and the glyph positions. The provided glyph and word | ||||
| segmentation can be used for text extraction and highlighting, but is probably | ||||
| not useful for further image-based processing. | ||||
| per-glyph confidence values and per-glyph alternative predictions as provided by | ||||
| the Calamari OCR engine. Note that while Calamari does not provide word | ||||
| segmentation, this processor produces word segmentation inferred from splitting | ||||
| the text on spaces and from the glyph positions. The provided glyph and word segmentation | ||||
| can be used for text extraction and highlighting, but is probably not useful for | ||||
| further image-based processing. | ||||
| 
 | ||||
| ## Installation | ||||
| 
 | ||||
|  |  | |||
|  | @ -4,7 +4,6 @@ import os | |||
| from glob import glob | ||||
| 
 | ||||
| import numpy as np | ||||
| import uniseg.wordbreak | ||||
| from calamari_ocr.ocr import MultiPredictor | ||||
| from calamari_ocr.ocr.voting import voter_from_proto | ||||
| from calamari_ocr.proto import VoterParams | ||||
|  | @ -101,26 +100,32 @@ class CalamariRecognize(Processor): | |||
| 
 | ||||
|                     # Save word results | ||||
|                     # | ||||
|                     # Calamari OCR does not provide word positions, so we infer word positions from a. Unicode text | ||||
|                     # segmentation and b. the glyph positions. This is necessary because the PAGE XML format enforces | ||||
|                     # a strict hierarchy of lines > words > glyphs. | ||||
|                     # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation | ||||
|                     # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict | ||||
|                     # hierarchy of lines > words > glyphs. | ||||
| 
 | ||||
|                     def unwanted(c): | ||||
|                         """ | ||||
|                         Define unwanted characters | ||||
| 
 | ||||
|                         Words only containing these e.g. whitespace characters are not considered as words. | ||||
|                         """ | ||||
|                         return c == " " | ||||
def _words(s):
    """
    Split a string into alternating runs of non-space and space characters.

    Runs of spaces are yielded as 'words' too, so joining everything that is
    yielded reproduces ``s`` exactly. This lets a caller advance a character
    index by ``len(word)`` for every yielded item.

    :param s: the text to split
    :return: generator of non-empty substrings of ``s``; each one consists
             either only of ' ' characters or of no ' ' characters at all.
             An empty ``s`` yields nothing.
    """
    word = ''
    for c in s:
        # A run ends as soon as the "is a space" property flips between the
        # previous character and the current one.
        if word and (c == ' ') != (word[-1] == ' '):
            yield word
            word = ''
        word += c
    # Guard the final yield so that an empty input does not produce a
    # spurious empty 'word'.
    if word:
        yield word
| 
 | ||||
|                     word_no = 0 | ||||
|                     i = 0 | ||||
|                     for word_text in uniseg.wordbreak.words(prediction.sentence): | ||||
|                         # XXX Re-use word segmentation from dinglehopper, i.e. support private use characters | ||||
|                         word_length = len(word_text) | ||||
|                         do_not_include = all(unwanted(c) for c in word_text) | ||||
| 
 | ||||
|                         if not do_not_include: | ||||
|                     for word_text in _words(prediction.sentence): | ||||
|                         word_length = len(word_text) | ||||
|                         if not all(c == ' ' for c in word_text): | ||||
|                             word_positions = prediction.positions[i:i+word_length] | ||||
|                             word_start = word_positions[0].global_start | ||||
|                             word_end = word_positions[-1].global_end | ||||
|  | @ -152,10 +157,9 @@ class CalamariRecognize(Processor): | |||
|                                 word.add_Glyph(glyph) | ||||
| 
 | ||||
|                             line.add_Word(word) | ||||
| 
 | ||||
|                             word_no += 1 | ||||
| 
 | ||||
|                         i += word_length | ||||
|                         word_no += 1 | ||||
| 
 | ||||
| 
 | ||||
|             _page_update_higher_textequiv_levels('line', pcgts) | ||||
|  |  | |||
|  | @ -4,4 +4,3 @@ calamari-ocr == 0.3.5 | |||
| setuptools >= 41.0.0  # tensorboard depends on this, but why do we get an error at runtime? | ||||
| click | ||||
| ocrd >= 2.2.1 | ||||
| uniseg | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue