From 6eb0a9350cc3112ab61be0076542b02eab431eb9 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 12 Jun 2020 20:05:33 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=A8=20dinglehopper:=20Unfuck=20substit?= =?UTF-8?q?utions=20a=20bit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../dinglehopper/substitute_equivalences.py | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/qurator/dinglehopper/substitute_equivalences.py b/qurator/dinglehopper/substitute_equivalences.py index 1b7e0cf..39be276 100644 --- a/qurator/dinglehopper/substitute_equivalences.py +++ b/qurator/dinglehopper/substitute_equivalences.py @@ -1,21 +1,15 @@ import unicodedata -def substitute_equivalences(s): +def unjoin_ligatures(s): + """Unjoin ligatures, i.e. ff becomes ff.""" - # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR - # It might make sense to use different rules for GT and for the different OCR equivalences = { - '': 'ü', '': 'ſſ', "\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I - '': 'ä', '': 'ch', - '==': '–', # → en-dash - '—': '–', # em-dash → en-dash '': 'ck', '': 'll', - '': 'ö', '': 'ſi', '': 'ſt', 'fi': 'fi', @@ -23,12 +17,7 @@ def substitute_equivalences(s): 'fl': 'fl', 'ffi': 'ffi', '': 'ct', - '’': '\'', - '⸗': '-', '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ - 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E - 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E - 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E '\uf532': 'as', # eMOP: Latin small ligature as '\uf533': 'is', # eMOP: Latin small ligature is '\uf534': 'us', # eMOP: Latin small ligature us @@ -37,10 +26,32 @@ def substitute_equivalences(s): '\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly? '\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST + } + s = unicodedata.normalize('NFC', s) + for fr, to in equivalences.items(): + s = s.replace(fr, to) + return s + + +def substitute_equivalences(s): + # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR + # It might make sense to use different rules for GT and for the different OCR + equivalences = { + '': 'ü', + '': 'ä', + '==': '–', # → en-dash + '—': '–', # em-dash → en-dash + '': 'ö', + '’': '\'', + '⸗': '-', + 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E + 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E + 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT } s = unicodedata.normalize('NFC', s) + s = unjoin_ligatures(s) for fr, to in equivalences.items(): s = s.replace(fr, to) return s