🚧 dinglehopper: Extract text while retaining segment id info

parent 5b353a2232
commit 5dbf563d6a

3 changed files with 66 additions and 59 deletions
File 1 of 3 (removed):

@@ -1,50 +0,0 @@
-import attr
-import unicodedata
-import enum
-
-
-# TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped
-# TODO types are not validated (attr does not do this yet)
-
-
-@attr.s(frozen=True)
-class ExtractedText:
-    segments = attr.ib()
-    joiner = attr.ib(type=str)
-
-    @property
-    def text(self):
-        return self.joiner.join(s.text for s in self.segments)
-
-    def segment_id_for_pos(self, pos):
-        i = 0
-        for s in self.segments:
-            if i <= pos < i + len(s.text):
-                return s.id
-            i += len(s.text)
-            if i <= pos < i + len(self.joiner):
-                return None
-            i += len(self.joiner)
-
-
-class Normalization(enum.Enum):
-    NFC = 1
-    NFC_MUFI = 2
-
-
-def normalize(text, normalization):
-    if normalization == Normalization.NFC:
-        return unicodedata.normalize('NFC', text)
-    else:
-        raise ValueError()
-
-
-@attr.s(frozen=True)
-class ExtractedTextSegment:
-    id = attr.ib(type=str)
-    text = attr.ib(type=str)
-    @text.validator
-    def check(self, attribute, value):
-        if normalize(value, self.normalization) != value:
-            raise ValueError('String "{}" is not normalized.'.format(value))
-    normalization = attr.ib(converter=Normalization, default=Normalization.NFC)
File 2 of 3:

@@ -1,6 +1,6 @@
 import unicodedata
 import pytest
-from extracted_text import ExtractedText, ExtractedTextSegment
+from qurator.dinglehopper import ExtractedText, ExtractedTextSegment
 from uniseg.graphemecluster import grapheme_clusters
 from qurator.dinglehopper import seq_align
 from collections import namedtuple
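The relocated classes can be exercised directly. The check below is an illustrative sketch, not a test from this commit; it uses only the ExtractedText and ExtractedTextSegment re-exports shown above.

from qurator.dinglehopper import ExtractedText, ExtractedTextSegment


def test_text_and_segment_lookup():
    # Pass a list (not a generator) so the segments can be iterated more than once.
    segments = [
        ExtractedTextSegment('l1', 'Hello'),
        ExtractedTextSegment('l2', 'world'),
    ]
    et = ExtractedText(segments, '\n')
    assert et.text == 'Hello\nworld'
    assert et.segment_id_for_pos(0) == 'l1'   # 'H' lies in the first segment
    assert et.segment_id_for_pos(5) is None   # position 5 is the '\n' joiner
    assert et.segment_id_for_pos(6) == 'l2'   # 'w' lies in the second segment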
File 3 of 3:

@@ -3,9 +3,57 @@ from __future__ import division, print_function
 from warnings import warn
 
 from lxml import etree as ET
-import sys
-
 from lxml.etree import XMLSyntaxError
+import sys
+import attr
+import enum
+import unicodedata
+
+
+@attr.s(frozen=True)
+class ExtractedText:
+    segments = attr.ib()
+    joiner = attr.ib(type=str)
+    # TODO Use type annotations for attr.ib types when support for Python 3.5 is dropped
+    # TODO Types are not validated (attr does not do this yet)
+
+    @property
+    def text(self):
+        return self.joiner.join(s.text for s in self.segments)
+
+    def segment_id_for_pos(self, pos):
+        i = 0
+        for s in self.segments:
+            if i <= pos < i + len(s.text):
+                return s.id
+            i += len(s.text)
+            if i <= pos < i + len(self.joiner):
+                return None
+            i += len(self.joiner)
+        # XXX Cache results
+
+
+class Normalization(enum.Enum):
+    NFC = 1
+    NFC_MUFI = 2
+
+
+def normalize(text, normalization):
+    if normalization == Normalization.NFC:
+        return unicodedata.normalize('NFC', text)
+    else:
+        raise ValueError()
+
+
+@attr.s(frozen=True)
+class ExtractedTextSegment:
+    id = attr.ib(type=str)
+    text = attr.ib(type=str)
+    @text.validator
+    def check(self, attribute, value):
+        if normalize(value, self.normalization) != value:
+            raise ValueError('String "{}" is not normalized.'.format(value))
+    normalization = attr.ib(converter=Normalization, default=Normalization.NFC)
 
 
 def alto_namespace(tree):
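As a quick illustration of the validator added here (a sketch, not part of the commit; it assumes only the ExtractedTextSegment re-export shown in the test file above):

import unicodedata

from qurator.dinglehopper import ExtractedTextSegment

composed = unicodedata.normalize('NFC', 'Bär')       # NFC form
decomposed = unicodedata.normalize('NFD', composed)  # 'a' + combining diaeresis, not NFC

ExtractedTextSegment('r1', composed)        # accepted: text is already NFC
try:
    ExtractedTextSegment('r1', decomposed)  # rejected by the @text.validator
except ValueError as err:
    print(err)                              # String "Bär" is not normalized.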
@@ -21,7 +69,7 @@ def alto_namespace(tree):
         raise ValueError('Not an ALTO tree')
 
 
-def alto_text(tree):
+def alto_extract(tree):
     """Extract text from the given ALTO ElementTree."""
 
     nsmap = {'alto': alto_namespace(tree)}
@@ -29,9 +77,15 @@ def alto_text(tree):
     lines = (
         ' '.join(string.attrib.get('CONTENT') for string in line.iterfind('alto:String', namespaces=nsmap))
         for line in tree.iterfind('.//alto:TextLine', namespaces=nsmap))
-    text_ = '\n'.join(lines)
 
-    return text_
+    return ExtractedText((ExtractedTextSegment(None, line_text) for line_text in lines), '\n')
+    # TODO This currently does not extract any segment id, because we are
+    #      clueless about the ALTO format.
+    # FIXME needs to handle normalization
+
+
+def alto_text(tree):
+    return alto_extract(tree).text
 
 
 def page_namespace(tree):
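A rough usage sketch of the new split; the module path and file name are assumptions, not taken from the diff:

from lxml import etree as ET

from qurator.dinglehopper.ocr_files import alto_extract, alto_text  # assumed module path

tree = ET.parse('00000001.alto.xml')    # hypothetical ALTO file
extracted = alto_extract(tree)
print(alto_text(tree))                  # the joined text, same as extracted.text
print(extracted.segment_id_for_pos(0))  # None for now: segment ids are not yet extracted

Since alto_extract() feeds a generator into ExtractedText, the .text property can only be evaluated once per extraction; the sketch therefore calls alto_text(tree) for the joined text and keeps extracted for the position lookup.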
@@ -47,7 +101,7 @@ def page_namespace(tree):
         raise ValueError('Not a PAGE tree')
 
 
-def page_text(tree):
+def page_extract(tree):
     """Extract text from the given PAGE content ElementTree."""
 
     nsmap = {'page': page_namespace(tree)}
@@ -80,10 +134,13 @@ def page_text(tree):
     # XXX Does a file have to have regions etc.? region vs lines etc.
     # Filter empty region texts
     region_texts = (t for t in region_texts if t)
+    return ExtractedText((ExtractedTextSegment(None, region_text) for region_text in region_texts), '\n')
+    # TODO This currently does not extract any segment id
+    # FIXME needs to handle normalization
 
-    text_ = '\n'.join(region_texts)
 
-    return text_
+def page_text(tree):
+    return page_extract(tree).text
 
 
 def text(filename):
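The PAGE path mirrors the ALTO one: page_text() is now a thin wrapper around page_extract(), whose segments are whole region texts joined with '\n'. A minimal equivalence check (module path and file name assumed, as above):

from lxml import etree as ET

from qurator.dinglehopper.ocr_files import page_extract, page_text  # assumed module path

tree = ET.parse('OCR-D-GT-PAGE_0001.xml')  # hypothetical PAGE XML file
assert page_text(tree) == page_extract(tree).text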