🎨 dinglehopper: Unfuck substitutions a bit

pull/38/head
Gerber, Mike 4 years ago
parent e3e7938162
commit 6eb0a9350c

@ -1,21 +1,15 @@
import unicodedata
def substitute_equivalences(s):
def unjoin_ligatures(s):
"""Unjoin ligatures, i.e. ff becomes ff."""
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
# It might make sense to use different rules for GT and for the different OCR
equivalences = {
'': 'ü',
'': 'ſſ',
"\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
'': 'ä',
'': 'ch',
'==': '', # → en-dash
'': '', # em-dash → en-dash
'': 'ck',
'': 'll',
'': 'ö',
'': 'ſi',
'': 'ſt',
'': 'fi',
@ -23,12 +17,7 @@ def substitute_equivalences(s):
'': 'fl',
'': 'ffi',
'': 'ct',
'': '\'',
'': '-',
'': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
'': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
'': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
'': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
'\uf532': 'as', # eMOP: Latin small ligature as
'\uf533': 'is', # eMOP: Latin small ligature is
'\uf534': 'us', # eMOP: Latin small ligature us
@ -37,10 +26,32 @@ def substitute_equivalences(s):
'\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly?
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
'': 'st', # U+FB06 LATIN SMALL LIGATURE ST
}
s = unicodedata.normalize('NFC', s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
return s
def substitute_equivalences(s):
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
# It might make sense to use different rules for GT and for the different OCR
equivalences = {
'': 'ü',
'': 'ä',
'==': '', # → en-dash
'': '', # em-dash → en-dash
'': 'ö',
'': '\'',
'': '-',
'': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
'': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
'': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
'\uF50E': '' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
}
s = unicodedata.normalize('NFC', s)
s = unjoin_ligatures(s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
return s

Loading…
Cancel
Save