1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-07-09 10:29:56 +02:00
dinglehopper/qurator/dinglehopper/substitute_equivalences.py
Gerber, Mike f94e8b9b1c Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector"
This reverts commit a3c1eee8f31349edcfb1e36920763bcecceb1129, reversing
changes made to dc76213ffc1fbabc2c45f0e52ced55449bdf2e83.
2019-12-09 12:44:05 +01:00

46 lines
1.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import unicodedata
def substitute_equivalences(s):
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
# It might make sense to use different rules for GT and for the different OCR
equivalences = {
'': 'ü',
'': 'ſſ',
"\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
'': 'ä',
'': 'ch',
'==': '', # → en-dash
'': '', # em-dash → en-dash
'': 'ck',
'': 'll',
'': 'ö',
'': 'ſi',
'': 'ſt',
'': 'fi',
'': 'ff',
'': 'fl',
'': 'ffi',
'': 'ct',
'': '\'',
'': '-',
'': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
'': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
'': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
'': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
'\uf532': 'as', # eMOP: Latin small ligature as
'\uf533': 'is', # eMOP: Latin small ligature is
'\uf534': 'us', # eMOP: Latin small ligature us
'\uf535': 'Qu', # eMOP: Latin ligature capital Q small u
'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ
'\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly?
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
'': 'st', # U+FB06 LATIN SMALL LIGATURE ST
'\uF50E': '' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
}
s = unicodedata.normalize('NFC', s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
return s