From 6eb0a9350cc3112ab61be0076542b02eab431eb9 Mon Sep 17 00:00:00 2001
From: "Gerber, Mike" <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 12 Jun 2020 20:05:33 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=A8=20dinglehopper:=20Unfuck=20substit?=
 =?UTF-8?q?utions=20a=20bit?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../dinglehopper/substitute_equivalences.py   | 41 ++++++++++++-------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/qurator/dinglehopper/substitute_equivalences.py b/qurator/dinglehopper/substitute_equivalences.py
index 1b7e0cf..39be276 100644
--- a/qurator/dinglehopper/substitute_equivalences.py
+++ b/qurator/dinglehopper/substitute_equivalences.py
@@ -1,21 +1,15 @@
 import unicodedata
 
 
-def substitute_equivalences(s):
+def unjoin_ligatures(s):
+    """Unjoin ligatures, e.g. ﬀ becomes ff."""
 
-    # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
-    # It might make sense to use different rules for GT and for the different OCR
     equivalences = {
-        '': 'ü',
         '': 'ſſ',
         "\ueba7": 'ſſi',  # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
-        '': 'ä',
         '': 'ch',
-        '==': '–',  # → en-dash
-        '—': '–',   # em-dash → en-dash
         '': 'ck',
         '': 'll',
-        '': 'ö',
         '': 'ſi',
         '': 'ſt',
         'ﬁ': 'fi',
@@ -23,12 +17,7 @@ def substitute_equivalences(s):
         'ﬂ': 'fl',
         'ﬃ': 'ffi',
         '': 'ct',
-        '’': '\'',
-        '⸗': '-',
         '': 'tz',       # MUFI: LATIN SMALL LIGATURE TZ
-        'aͤ': 'ä',        # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
-        'oͤ': 'ö',        # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
-        'uͤ': 'ü',        # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
         '\uf532': 'as',  # eMOP: Latin small ligature as
         '\uf533': 'is',  # eMOP: Latin small ligature is
         '\uf534': 'us',  # eMOP: Latin small ligature us
@@ -37,10 +26,32 @@ def substitute_equivalences(s):
         '\uE8BF': 'q&',  # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET  XXX How to replace this correctly?
         '\uEBA5': 'ſp',  # MUFI: LATIN SMALL LIGATURE LONG S P
         'ﬆ': 'st',      # U+FB06 LATIN SMALL LIGATURE ST
-        '\uF50E': 'q́'    # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
     }
-
     s = unicodedata.normalize('NFC', s)
     for fr, to in equivalences.items():
         s = s.replace(fr, to)
     return s
+
+
+def substitute_equivalences(s):
+    # These equivalences are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR.
+    # It might make sense to use different rules for GT and for each of the OCR engines.
+    equivalences = {
+        '': 'ü',
+        '': 'ä',
+        '==': '–',  # → en-dash
+        '—': '–',   # em-dash → en-dash
+        '': 'ö',
+        '’': '\'',
+        '⸗': '-',
+        'aͤ': 'ä',        # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
+        'oͤ': 'ö',        # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
+        'uͤ': 'ü',        # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
+        '\uF50E': 'q́'    # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
+    }
+
+    s = unicodedata.normalize('NFC', s)
+    s = unjoin_ligatures(s)
+    for fr, to in equivalences.items():
+        s = s.replace(fr, to)
+    return s