mirror of
				https://github.com/qurator-spk/dinglehopper.git
				synced 2025-10-26 15:04:15 +01:00 
			
		
		
		
	🚧 dinglehopper: WIP data structure for extracted text
This commit is contained in:
		
							parent
							
								
									96273b026d
								
							
						
					
					
						commit
						5a5e3c824b
					
				
					 2 changed files with 16 additions and 8 deletions
				
			
		|  | @ -1,5 +1,6 @@ | |||
| import attr | ||||
| import unicodedata | ||||
| +import enum | ||||
| 
 | ||||
| 
 | ||||
| # TODO handle grapheme cluster positions? | ||||
|  | @ -27,11 +28,13 @@ class ExtractedText: | |||
|             i += len(self.joiner) | ||||
| 
 | ||||
| 
 | ||||
| -NORM_NFC = 0 | ||||
| +class Normalization(enum.Enum): | ||||
| +    NFC = 1 | ||||
| +    NFC_MUFI = 2 | ||||
| 
 | ||||
| 
 | ||||
| def normalize(text, normalization): | ||||
| -    if normalization == NORM_NFC: | ||||
| +    if normalization == Normalization.NFC: | ||||
|         return unicodedata.normalize('NFC', text) | ||||
|     else: | ||||
|         raise ValueError() | ||||
|  | @ -45,4 +48,4 @@ class ExtractedTextSegment: | |||
|     def check(self, attribute, value): | ||||
|         if normalize(value, self.normalization) != value: | ||||
|             raise ValueError('String "{}" is not normalized.'.format(value)) | ||||
| -    normalization = attr.ib(default=NORM_NFC) | ||||
| +    normalization = attr.ib(converter=Normalization, default=Normalization.NFC) | ||||
|  |  | |||
|  | @ -1,17 +1,22 @@ | |||
| -from extracted_text import * | ||||
| +import unicodedata | ||||
| +import pytest | ||||
| +from extracted_text import ExtractedText, ExtractedTextSegment | ||||
| 
 | ||||
| 
 | ||||
| def test_text(): | ||||
|     test1 = ExtractedText([ | ||||
|         ExtractedTextSegment('s0', 'foo'), | ||||
| -        ExtractedTextSegment('s1', 'bar'), | ||||
| +        ExtractedTextSegment(1, 'bar'), | ||||
|         ExtractedTextSegment('s2', 'bazinga') | ||||
|     ], ' ') | ||||
| 
 | ||||
| 
 | ||||
|     assert test1.text == 'foo bar bazinga' | ||||
|     assert test1.segment_id_for_pos(0) == 's0' | ||||
|     assert test1.segment_id_for_pos(3) is None | ||||
|     assert test1.segment_id_for_pos(10) == 's2' | ||||
| 
 | ||||
| -# ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) | ||||
| -ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) | ||||
| 
 | ||||
| +def test_normalization_check(): | ||||
| +    with pytest.raises(ValueError, match=r'.*is not normalized.*'): | ||||
| +        ExtractedTextSegment('foo', unicodedata.normalize('NFD', 'Schlyñ')) | ||||
| +    assert ExtractedTextSegment('foo', unicodedata.normalize('NFC', 'Schlyñ')) | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue