📝 dinglehopper: Add detail about the text extraction and ExtractedText

2025-12-13 06:04:13 +01:00 · 2020-10-08 17:05:36 +02:00 · 2020-10-08 17:05:36 +02:00 · c6c6b8efab
commit c6c6b8efab
parent 7025ea54a8
1 changed files with 21 additions and 4 deletions
--- a/qurator/dinglehopper/extracted_text.py
+++ b/qurator/dinglehopper/extracted_text.py
@@ -35,10 +35,27 @@ def normalize_sbb(t):
@attr.s(frozen=True)
 class ExtractedText:
    """
-    Extracted text
+    Extracted text.
-    Objects of this class are guaranteed to be a. always in their normalization and
+    We need a segment id for each extracted text segment. As this should support
-    b. in NFC.
+    extracting from the word (or even glyph) level, we need to have a
    hierarchical representation of the
    text due to the different "joiners" needed on each level.
    For example, here is pseudo code to get the text of a page:
    * from region texts:
      `'\n'.join(region_texts)`
    * from line texts:
      `'\n'.join('\n'.join(line_texts) for every region)`
    * from word texts:
      `'\n'.join(('\n'.join(' '.join(word_texts) for every line) for every region))`
    An ExtractedText object either contains a text itself or has child segments
    (and a joiner), not both.
    Objects of this class are guaranteed to be a. always in their normalization
    and b. in NFC.
    """
    segment_id = attr.ib(type=Optional[str])
@@ -115,4 +132,4 @@ class ExtractedText:
    @classmethod
    def from_str(cls, text, normalization=Normalization.NFC_SBB):
        normalized_text = normalize(text, normalization)
-        return cls(None, None, None, normalized_text, normalization=normalization)
+        return cls(None, None, None, normalized_text, normalization=normalization)