🎨 dinglehopper: Unfuck substitutions a bit

pull/38/head
Gerber, Mike 4 years ago
parent e3e7938162
commit 6eb0a9350c

@ -1,21 +1,15 @@
import unicodedata import unicodedata
def substitute_equivalences(s): def unjoin_ligatures(s):
"""Unjoin ligatures, i.e. ﬀ becomes ff."""
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
# It might make sense to use different rules for GT and for the different OCR
equivalences = { equivalences = {
'': 'ü',
'': 'ſſ', '': 'ſſ',
"\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I "\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
'': 'ä',
'': 'ch', '': 'ch',
'==': '–', # → en-dash
'—': '–', # em-dash → en-dash
'': 'ck', '': 'ck',
'': 'll', '': 'll',
'': 'ö',
'': 'ſi', '': 'ſi',
'': 'ſt', '': 'ſt',
'': 'fi', '': 'fi',
@ -23,12 +17,7 @@ def substitute_equivalences(s):
'': 'fl', '': 'fl',
'': 'ffi', '': 'ffi',
'': 'ct', '': 'ct',
'': '\'',
'': '-',
'': 'tz', # MUFI: LATIN SMALL LIGATURE TZ '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
'\uf532': 'as', # eMOP: Latin small ligature as '\uf532': 'as', # eMOP: Latin small ligature as
'\uf533': 'is', # eMOP: Latin small ligature is '\uf533': 'is', # eMOP: Latin small ligature is
'\uf534': 'us', # eMOP: Latin small ligature us '\uf534': 'us', # eMOP: Latin small ligature us
@ -37,10 +26,32 @@ def substitute_equivalences(s):
'\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly? '\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly?
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P '\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
'ﬆ': 'st', # U+FB06 LATIN SMALL LIGATURE ST 'ﬆ': 'st', # U+FB06 LATIN SMALL LIGATURE ST
}
s = unicodedata.normalize('NFC', s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
return s
def substitute_equivalences(s):
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
# It might make sense to use different rules for GT and for the different OCR
equivalences = {
'': 'ü',
'': 'ä',
'==': '–', # → en-dash
'—': '–', # em-dash → en-dash
'': 'ö',
'': '\'',
'': '-',
'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
'\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
} }
s = unicodedata.normalize('NFC', s) s = unicodedata.normalize('NFC', s)
s = unjoin_ligatures(s)
for fr, to in equivalences.items(): for fr, to in equivalences.items():
s = s.replace(fr, to) s = s.replace(fr, to)
return s return s

Loading…
Cancel
Save