mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-20 00:59:58 +02:00
📝 dinglehopper: Add detail about the text extraction and ExtractedText
This commit is contained in:
parent
7025ea54a8
commit
c6c6b8efab
1 changed files with 21 additions and 4 deletions
|
@ -35,10 +35,27 @@ def normalize_sbb(t):
|
||||||
@attr.s(frozen=True)
|
@attr.s(frozen=True)
|
||||||
class ExtractedText:
|
class ExtractedText:
|
||||||
"""
|
"""
|
||||||
Extracted text
|
Extracted text.
|
||||||
|
|
||||||
Objects of this class are guaranteed to be a. always in their normalization and
|
We need a segment id for each extracted text segment. As this should support
|
||||||
b. in NFC.
|
extracting from the word (or even glyph) level, we need to have a
|
||||||
|
hierarchical representation of the
|
||||||
|
text due to the different "joiners" needed on each level.
|
||||||
|
|
||||||
|
For example, here is pseudo code to get the text of a page:
|
||||||
|
|
||||||
|
* from region texts:
|
||||||
|
`'\n'.join(region_texts)`
|
||||||
|
* from line texts:
|
||||||
|
`'\n'.join('\n'.join(line_texts) for every region)`
|
||||||
|
* from word texts:
|
||||||
|
`'\n'.join(('\n'.join(' '.join(word_texts) for every line) for every region))`
|
||||||
|
|
||||||
|
An ExtractedText object either contains a text itself or has child segments
|
||||||
|
(and a joiner), not both.
|
||||||
|
|
||||||
|
Objects of this class are guaranteed to be (a) always in their normalization
|
||||||
|
and (b) in NFC.
|
||||||
"""
|
"""
|
||||||
segment_id = attr.ib(type=Optional[str])
|
segment_id = attr.ib(type=Optional[str])
|
||||||
|
|
||||||
|
@ -115,4 +132,4 @@ class ExtractedText:
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_str(cls, text, normalization=Normalization.NFC_SBB):
|
def from_str(cls, text, normalization=Normalization.NFC_SBB):
|
||||||
normalized_text = normalize(text, normalization)
|
normalized_text = normalize(text, normalization)
|
||||||
return cls(None, None, None, normalized_text, normalization=normalization)
|
return cls(None, None, None, normalized_text, normalization=normalization)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue