1
0
Fork 0
mirror of https://github.com/qurator-spk/dinglehopper.git synced 2025-07-06 00:49:59 +02:00
dinglehopper/qurator/dinglehopper/substitute_equivalences.py

57 lines
2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import unicodedata
def unjoin_ligatures(s: str) -> str:
    """Unjoin ligatures, i.e. U+FB00 (LATIN SMALL LIGATURE FF) becomes 'ff'.

    The input is NFC-normalized first, then every ligature code point listed
    below is replaced by its spelled-out letter sequence.

    All keys are written as explicit ``\\uXXXX`` escapes: several of them are
    Private Use Area code points that are invisible in most editors and are
    easily lost or folded by tools, which would silently turn a key into the
    empty string or into plain ASCII letters.
    """
    equivalences = {
        # NOTE(review): the code points marked "verify" were reconstructed
        # from the MUFI character recommendation - confirm against upstream.
        '\ueba6': 'ſſ',   # MUFI: LATIN SMALL LIGATURE LONG S LONG S (verify)
        '\ueba7': 'ſſi',  # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
        '\ueec4': 'ch',   # MUFI: LATIN SMALL LIGATURE CH (verify)
        '\ueec5': 'ck',   # MUFI: LATIN SMALL LIGATURE CK (verify)
        '\uf4f9': 'll',   # MUFI: LATIN SMALL LIGATURE LL (verify)
        '\ueba2': 'ſi',   # MUFI: LATIN SMALL LIGATURE LONG S I (verify)
        '\ufb05': 'ſt',   # U+FB05 LATIN SMALL LIGATURE LONG S T
        '\ufb01': 'fi',   # U+FB01 LATIN SMALL LIGATURE FI
        '\ufb00': 'ff',   # U+FB00 LATIN SMALL LIGATURE FF
        '\ufb02': 'fl',   # U+FB02 LATIN SMALL LIGATURE FL
        '\ufb03': 'ffi',  # U+FB03 LATIN SMALL LIGATURE FFI
        '\ueec6': 'ct',   # MUFI: LATIN SMALL LIGATURE CT (verify)
        '\ueedc': 'tz',   # MUFI: LATIN SMALL LIGATURE TZ (verify code point)
        '\uf532': 'as',   # eMOP: Latin small ligature as
        '\uf533': 'is',   # eMOP: Latin small ligature is
        '\uf534': 'us',   # eMOP: Latin small ligature us
        '\uf535': 'Qu',   # eMOP: Latin ligature capital Q small u
        '\u0133': 'ij',   # U+0133 LATIN SMALL LIGATURE IJ
        '\ue8bf': 'q&',   # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET  XXX How to replace this correctly?
        '\ueba5': 'ſp',   # MUFI: LATIN SMALL LIGATURE LONG S P
        '\ufb06': 'st',   # U+FB06 LATIN SMALL LIGATURE ST
    }
    s = unicodedata.normalize('NFC', s)
    # Every key is a single code point and no replacement text contains a key,
    # so one translate() pass is equivalent to replacing the keys one by one.
    return s.translate(str.maketrans(equivalences))
def substitute_equivalences(s: str) -> str:
    """Replace characters by an equivalent, more common spelling.

    The input is NFC-normalized, ligatures are unjoined via
    ``unjoin_ligatures``, and then each key of the table below is replaced by
    its value, in table order.

    All non-ASCII keys and values are written as explicit ``\\uXXXX`` escapes
    so that invisible Private Use Area code points, combining marks and
    look-alike dashes cannot be lost or confused when the file is edited.
    """
    # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
    # It might make sense to use different rules for GT and for the different OCR
    equivalences = {
        # NOTE(review): the code points marked "verify" were reconstructed
        # from the MUFI character recommendation - confirm against upstream.
        '\ue72b': '\u00fc',    # MUFI: u with small e above (verify) -> U+00FC
        '\ue42c': '\u00e4',    # MUFI: a with small e above (verify) -> U+00E4
        '==': '\u2013',        # → en-dash
        '\u2014': '\u2013',    # em-dash → en-dash
        '\ue644': '\u00f6',    # MUFI: o with small e above (verify) -> U+00F6
        '\u2019': '\'',        # presumably U+2019 RIGHT SINGLE QUOTATION MARK (verify)
        '\u2e17': '-',         # presumably U+2E17 DOUBLE OBLIQUE HYPHEN (verify)
        'a\u0364': '\u00e4',   # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
        'o\u0364': '\u00f6',   # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
        'u\u0364': '\u00fc',   # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
        '\uf50e': 'q\u0301',   # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
    }

    s = unicodedata.normalize('NFC', s)

    # Unjoin ligatures before applying the table
    s = unjoin_ligatures(s)
    for fr, to in equivalences.items():
        s = s.replace(fr, to)
    return s