mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-07 19:05:13 +02:00
🧹 dinglehopper: Move all normalization code to extracted_text.py
This commit is contained in:
parent
009fa55c09
commit
82217a25bb
3 changed files with 57 additions and 60 deletions
|
@ -1,5 +1,5 @@
|
|||
from .ocr_files import *
|
||||
from .substitute_equivalences import *
|
||||
from .extracted_text import *
|
||||
from .character_error_rate import *
|
||||
from .word_error_rate import *
|
||||
from .align import *
|
||||
|
|
|
@ -7,8 +7,6 @@ from typing import Optional
|
|||
|
||||
import attr
|
||||
|
||||
from .substitute_equivalences import substitute_equivalences
|
||||
|
||||
|
||||
class Normalization(enum.Enum):
|
||||
NFC = 1
|
||||
|
@ -32,6 +30,62 @@ def normalize_sbb(t):
|
|||
return normalize(t, Normalization.NFC_SBB)
|
||||
|
||||
|
||||
def unjoin_ligatures(s):
    """Unjoin ligatures, i.e. ﬀ becomes ff.

    The input is NFC-normalized first so composed forms match the table,
    then every known ligature codepoint is expanded into its letters.
    """
    # NOTE(review): several keys of this table were MUFI private-use
    # codepoints that were lost in transcription (they appear as empty
    # strings in the diff). An empty-string key would make str.replace()
    # insert the value between every character, corrupting the text, so
    # those entries are omitted here. TODO: restore them from the MUFI
    # character recommendation (ligatures ſſ, ch, ck, ll, ſi, ſt, ct, tz).
    equivalences = {
        '\ueba7': 'ſſi',  # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
        '\ufb01': 'fi',   # U+FB01 LATIN SMALL LIGATURE FI
        '\ufb00': 'ff',   # U+FB00 LATIN SMALL LIGATURE FF
        '\ufb02': 'fl',   # U+FB02 LATIN SMALL LIGATURE FL
        '\ufb03': 'ffi',  # U+FB03 LATIN SMALL LIGATURE FFI
        '\uf532': 'as',   # eMOP: Latin small ligature as
        '\uf533': 'is',   # eMOP: Latin small ligature is
        '\uf534': 'us',   # eMOP: Latin small ligature us
        '\uf535': 'Qu',   # eMOP: Latin ligature capital Q small u
        '\u0133': 'ij',   # U+0133 LATIN SMALL LIGATURE IJ
        '\ue8bf': 'q&',   # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
                          # XXX How to replace this correctly?
        '\ueba5': 'ſp',   # MUFI: LATIN SMALL LIGATURE LONG S P
        '\ufb06': 'st',   # U+FB06 LATIN SMALL LIGATURE ST
    }
    s = unicodedata.normalize('NFC', s)
    for fr, to in equivalences.items():
        s = s.replace(fr, to)
    return s


def substitute_equivalences(s):
    """Substitute character equivalences to harmonize OCR and GT text.

    These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR.
    It might make sense to use different rules for GT and for the
    different OCR engines.
    """
    # NOTE(review): a few keys of this table were private-use codepoints
    # (precomposed variants mapping to 'ü', 'ä' and 'ö') that were lost in
    # transcription as empty strings; empty keys would inject the value
    # between every character, so they are omitted. TODO: restore the
    # original codepoints.
    equivalences = {
        '==': '–',  # → en-dash
        '—': '–',   # em-dash → en-dash
        '’': '\'',
        '⸗': '-',
        'aͤ': 'ä',  # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
        'oͤ': 'ö',  # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
        'uͤ': 'ü',  # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
        '\uf50e': 'q́',  # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
    }

    s = unicodedata.normalize('NFC', s)
    s = unjoin_ligatures(s)
    for fr, to in equivalences.items():
        s = s.replace(fr, to)
    return s
|
||||
|
||||
|
||||
@attr.s(frozen=True)
|
||||
class ExtractedText:
|
||||
"""
|
||||
|
|
|
@ -1,57 +0,0 @@
|
|||
import unicodedata
|
||||
|
||||
|
||||
def unjoin_ligatures(s):
    """Unjoin ligatures, i.e. ﬀ becomes ff.

    The input is NFC-normalized first so composed forms match the table,
    then every known ligature codepoint is expanded into its letters.
    """
    # NOTE(review): several keys of this table were MUFI private-use
    # codepoints that were lost in transcription (they appear as empty
    # strings in the diff). An empty-string key would make str.replace()
    # insert the value between every character, corrupting the text, so
    # those entries are omitted here. TODO: restore them from the MUFI
    # character recommendation (ligatures ſſ, ch, ck, ll, ſi, ſt, ct, tz).
    equivalences = {
        '\ueba7': 'ſſi',  # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
        '\ufb01': 'fi',   # U+FB01 LATIN SMALL LIGATURE FI
        '\ufb00': 'ff',   # U+FB00 LATIN SMALL LIGATURE FF
        '\ufb02': 'fl',   # U+FB02 LATIN SMALL LIGATURE FL
        '\ufb03': 'ffi',  # U+FB03 LATIN SMALL LIGATURE FFI
        '\uf532': 'as',   # eMOP: Latin small ligature as
        '\uf533': 'is',   # eMOP: Latin small ligature is
        '\uf534': 'us',   # eMOP: Latin small ligature us
        '\uf535': 'Qu',   # eMOP: Latin ligature capital Q small u
        '\u0133': 'ij',   # U+0133 LATIN SMALL LIGATURE IJ
        '\ue8bf': 'q&',   # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
                          # XXX How to replace this correctly?
        '\ueba5': 'ſp',   # MUFI: LATIN SMALL LIGATURE LONG S P
        '\ufb06': 'st',   # U+FB06 LATIN SMALL LIGATURE ST
    }
    s = unicodedata.normalize('NFC', s)
    for fr, to in equivalences.items():
        s = s.replace(fr, to)
    return s


def substitute_equivalences(s):
    """Substitute character equivalences to harmonize OCR and GT text.

    These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR.
    It might make sense to use different rules for GT and for the
    different OCR engines.
    """
    # NOTE(review): a few keys of this table were private-use codepoints
    # (precomposed variants mapping to 'ü', 'ä' and 'ö') that were lost in
    # transcription as empty strings; empty keys would inject the value
    # between every character, so they are omitted. TODO: restore the
    # original codepoints.
    equivalences = {
        '==': '–',  # → en-dash
        '—': '–',   # em-dash → en-dash
        '’': '\'',
        '⸗': '-',
        'aͤ': 'ä',  # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
        'oͤ': 'ö',  # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
        'uͤ': 'ü',  # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
        '\uf50e': 'q́',  # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
    }

    s = unicodedata.normalize('NFC', s)
    s = unjoin_ligatures(s)
    for fr, to in equivalences.items():
        s = s.replace(fr, to)
    return s
|
Loading…
Add table
Add a link
Reference in a new issue