mirror of
				https://github.com/mikegerber/ocrd_calamari.git
				synced 2025-10-30 23:34:13 +01:00 
			
		
		
		
	✨ Do word segmentation as expected by OCR-D PAGE specs
This commit is contained in:
		
							parent
							
								
									0f9c94e7dc
								
							
						
					
					
						commit
						6f4736f8e4
					
				
					 3 changed files with 28 additions and 25 deletions
				
			
		
							
								
								
									
										12
									
								
								README.md
									
										
									
									
									
								
							
							
						
						
									
										12
									
								
								README.md
									
										
									
									
									
								
							|  | @ -14,12 +14,12 @@ This processor only operates on the text line level and so needs a line segmenta | ||||||
| image) as its input. | image) as its input. | ||||||
| 
 | 
 | ||||||
| In addition to the line text it also outputs glyph segmentation including | In addition to the line text it also outputs glyph segmentation including | ||||||
| per-glyph confidence values and per-glyph alternative predictions as provided | per-glyph confidence values and per-glyph alternative predictions as provided by | ||||||
| by the Calamari OCR engine. Note that while Calamari does not provide word | the Calamari OCR engine. Note that while Calamari does not provide word | ||||||
| segmentation, this processor produces word segmentation inferred from Unicode | segmentation, this processor produces word segmentation inferred from text | ||||||
| text segmentation and the glyph positions. The provided glyph and word | segmentation and the glyph positions. The provided glyph and word segmentation | ||||||
| segmentation can be used for text extraction and highlighting, but is probably | can be used for text extraction and highlighting, but is probably not useful for | ||||||
| not useful for further image-based processing. | further image-based processing. | ||||||
| 
 | 
 | ||||||
| ## Installation | ## Installation | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -4,7 +4,6 @@ import os | ||||||
| from glob import glob | from glob import glob | ||||||
| 
 | 
 | ||||||
| import numpy as np | import numpy as np | ||||||
| import uniseg.wordbreak |  | ||||||
| from calamari_ocr.ocr import MultiPredictor | from calamari_ocr.ocr import MultiPredictor | ||||||
| from calamari_ocr.ocr.voting import voter_from_proto | from calamari_ocr.ocr.voting import voter_from_proto | ||||||
| from calamari_ocr.proto import VoterParams | from calamari_ocr.proto import VoterParams | ||||||
|  | @ -101,26 +100,32 @@ class CalamariRecognize(Processor): | ||||||
| 
 | 
 | ||||||
|                     # Save word results |                     # Save word results | ||||||
|                     # |                     # | ||||||
|                     # Calamari OCR does not provide word positions, so we infer word positions from a. Unicode text |                     # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation | ||||||
|                     # segmentation and b. the glyph positions. This is necessary because the PAGE XML format enforces |                     # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict | ||||||
|                     # a strict hierarchy of lines > words > glyphs. |                     # hierarchy of lines > words > glyphs. | ||||||
| 
 | 
 | ||||||
|                     def unwanted(c): |                     def _words(s): | ||||||
|                         """ |                         """Split words based on spaces and include spaces as 'words'""" | ||||||
|                         Define unwanted characters |                         spaces = None | ||||||
| 
 |                         word = '' | ||||||
|                         Words only containing these e.g. whitespace characters are not considered as words. |                         for c in s: | ||||||
|                         """ |                             if c == ' ' and spaces is True: | ||||||
|                         return c == " " |                                 word += c | ||||||
|  |                             elif c != ' ' and spaces is False: | ||||||
|  |                                 word += c | ||||||
|  |                             else: | ||||||
|  |                                 if word: | ||||||
|  |                                     yield word | ||||||
|  |                                 word = c | ||||||
|  |                                 spaces = (c == ' ') | ||||||
|  |                         yield word | ||||||
| 
 | 
 | ||||||
|                     word_no = 0 |                     word_no = 0 | ||||||
|                     i = 0 |                     i = 0 | ||||||
|                     for word_text in uniseg.wordbreak.words(prediction.sentence): |  | ||||||
|                         # XXX Re-use word segmentation from dinglehopper, i.e. support private use characters |  | ||||||
|                         word_length = len(word_text) |  | ||||||
|                         do_not_include = all(unwanted(c) for c in word_text) |  | ||||||
| 
 | 
 | ||||||
|                         if not do_not_include: |                     for word_text in _words(prediction.sentence): | ||||||
|  |                         word_length = len(word_text) | ||||||
|  |                         if not all(c == ' ' for c in word_text): | ||||||
|                             word_positions = prediction.positions[i:i+word_length] |                             word_positions = prediction.positions[i:i+word_length] | ||||||
|                             word_start = word_positions[0].global_start |                             word_start = word_positions[0].global_start | ||||||
|                             word_end = word_positions[-1].global_end |                             word_end = word_positions[-1].global_end | ||||||
|  | @ -152,10 +157,9 @@ class CalamariRecognize(Processor): | ||||||
|                                 word.add_Glyph(glyph) |                                 word.add_Glyph(glyph) | ||||||
| 
 | 
 | ||||||
|                             line.add_Word(word) |                             line.add_Word(word) | ||||||
| 
 |                             word_no += 1 | ||||||
| 
 | 
 | ||||||
|                         i += word_length |                         i += word_length | ||||||
|                         word_no += 1 |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|             _page_update_higher_textequiv_levels('line', pcgts) |             _page_update_higher_textequiv_levels('line', pcgts) | ||||||
|  |  | ||||||
|  | @ -4,4 +4,3 @@ calamari-ocr == 0.3.5 | ||||||
| setuptools >= 41.0.0  # tensorboard depends on this, but why do we get an error at runtime? | setuptools >= 41.0.0  # tensorboard depends on this, but why do we get an error at runtime? | ||||||
| click | click | ||||||
| ocrd >= 2.2.1 | ocrd >= 2.2.1 | ||||||
| uniseg |  | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue