mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-20 00:59:58 +02:00
📝 dinglehopper: Add detail about the text extraction and ExtractedText
This commit is contained in:
parent
7025ea54a8
commit
c6c6b8efab
1 changed files with 21 additions and 4 deletions
|
@ -35,10 +35,27 @@ def normalize_sbb(t):
|
|||
@attr.s(frozen=True)
|
||||
class ExtractedText:
|
||||
"""
|
||||
Extracted text
|
||||
Extracted text.
|
||||
|
||||
Objects of this class are guaranteed to be a. always in their normalization and
|
||||
b. in NFC.
|
||||
We need a segment id for each extracted text segment. As this should support
|
||||
extracting from the word (or even glyph) level, we need to have a
|
||||
hierarchical representation of the
|
||||
text due to the different "joiners" needed on each level.
|
||||
|
||||
For example, here is pseudo code to get the text of a page:
|
||||
|
||||
* from region texts:
|
||||
`'\n'.join(region_texts)`
|
||||
* from line texts:
|
||||
`'\n'.join('\n'.join(line_texts) for every region)`
|
||||
* from word texts:
|
||||
`'\n'.join(('\n'.join(' '.join(word_texts) for every line) for every region))`
|
||||
|
||||
An ExtractedText object either contains a text itself or has child segments
|
||||
(and a joiner), not both.
|
||||
|
||||
Objects of this class are guaranteed to be a. always normalized according to their `normalization`
|
||||
and b. in NFC.
|
||||
"""
|
||||
segment_id = attr.ib(type=Optional[str])
|
||||
|
||||
|
@ -115,4 +132,4 @@ class ExtractedText:
|
|||
@classmethod
|
||||
def from_str(cls, text, normalization=Normalization.NFC_SBB):
|
||||
normalized_text = normalize(text, normalization)
|
||||
return cls(None, None, None, normalized_text, normalization=normalization)
|
||||
return cls(None, None, None, normalized_text, normalization=normalization)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue