1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-07-03 23:49:57 +02:00
dinglehopper/qurator/dinglehopper/substitute_equivalences.py
Gerber, Mike 48a31ce672 Revert "Merge branch 'master' of https://github.com/qurator-spk/sbb_textline_detector"
This reverts commit 2c89bf3b35ee290d7b830ef270df3a96aa48245e, reversing
changes made to 9f7e413148ca5dbac9b555d7b0d0a5fa3a0f5340.
2019-12-09 12:44:05 +01:00

46 lines
1.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import unicodedata
def substitute_equivalences(s):
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
# It might make sense to use different rules for GT and for the different OCR
equivalences = {
'': 'ü',
'': 'ſſ',
"\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
'': 'ä',
'': 'ch',
'==': '', # → en-dash
'': '', # em-dash → en-dash
'': 'ck',
'': 'll',
'': 'ö',
'': 'ſi',
'': 'ſt',
'': 'fi',
'': 'ff',
'': 'fl',
'': 'ffi',
'': 'ct',
'': '\'',
'': '-',
'': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
'': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
'': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
'': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
'\uf532': 'as', # eMOP: Latin small ligature as
'\uf533': 'is', # eMOP: Latin small ligature is
'\uf534': 'us', # eMOP: Latin small ligature us
'\uf535': 'Qu', # eMOP: Latin ligature capital Q small u
'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ
'\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly?
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
'': 'st', # U+FB06 LATIN SMALL LIGATURE ST
'\uF50E': '' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
}
s = unicodedata.normalize('NFC', s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
return s