mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-07-15 21:39:57 +02:00
🎨 dinglehopper: Unfuck substitutions a bit
This commit is contained in:
parent
e3e7938162
commit
6eb0a9350c
1 changed files with 26 additions and 15 deletions
|
@ -1,21 +1,15 @@
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
|
|
||||||
def substitute_equivalences(s):
|
def unjoin_ligatures(s):
|
||||||
|
"""Unjoin ligatures, i.e. ﬀ becomes ff."""
|
||||||
|
|
||||||
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
|
|
||||||
# It might make sense to use different rules for GT and for the different OCR
|
|
||||||
equivalences = {
|
equivalences = {
|
||||||
'': 'ü',
|
|
||||||
'': 'ſſ',
|
'': 'ſſ',
|
||||||
"\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
|
"\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
|
||||||
'': 'ä',
|
|
||||||
'': 'ch',
|
'': 'ch',
|
||||||
'==': '–', # → en-dash
|
|
||||||
'—': '–', # em-dash → en-dash
|
|
||||||
'': 'ck',
|
'': 'ck',
|
||||||
'': 'll',
|
'': 'll',
|
||||||
'': 'ö',
|
|
||||||
'': 'ſi',
|
'': 'ſi',
|
||||||
'': 'ſt',
|
'': 'ſt',
|
||||||
'ﬁ': 'fi',
|
'ﬁ': 'fi',
|
||||||
|
@ -23,12 +17,7 @@ def substitute_equivalences(s):
|
||||||
'ﬂ': 'fl',
|
'ﬂ': 'fl',
|
||||||
'ﬃ': 'ffi',
|
'ﬃ': 'ffi',
|
||||||
'': 'ct',
|
'': 'ct',
|
||||||
'’': '\'',
|
|
||||||
'⸗': '-',
|
|
||||||
'': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
|
'': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
|
||||||
'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
|
|
||||||
'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
|
|
||||||
'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
|
|
||||||
'\uf532': 'as', # eMOP: Latin small ligature as
|
'\uf532': 'as', # eMOP: Latin small ligature as
|
||||||
'\uf533': 'is', # eMOP: Latin small ligature is
|
'\uf533': 'is', # eMOP: Latin small ligature is
|
||||||
'\uf534': 'us', # eMOP: Latin small ligature us
|
'\uf534': 'us', # eMOP: Latin small ligature us
|
||||||
|
@ -37,10 +26,32 @@ def substitute_equivalences(s):
|
||||||
'\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly?
|
'\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly?
|
||||||
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
|
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
|
||||||
'ﬆ': 'st', # U+FB06 LATIN SMALL LIGATURE ST
|
'ﬆ': 'st', # U+FB06 LATIN SMALL LIGATURE ST
|
||||||
'\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
|
|
||||||
}
|
}
|
||||||
|
|
||||||
s = unicodedata.normalize('NFC', s)
|
s = unicodedata.normalize('NFC', s)
|
||||||
for fr, to in equivalences.items():
|
for fr, to in equivalences.items():
|
||||||
s = s.replace(fr, to)
|
s = s.replace(fr, to)
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
def substitute_equivalences(s):
|
||||||
|
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
|
||||||
|
# It might make sense to use different rules for GT and for the different OCR
|
||||||
|
equivalences = {
|
||||||
|
'': 'ü',
|
||||||
|
'': 'ä',
|
||||||
|
'==': '–', # → en-dash
|
||||||
|
'—': '–', # em-dash → en-dash
|
||||||
|
'': 'ö',
|
||||||
|
'’': '\'',
|
||||||
|
'⸗': '-',
|
||||||
|
'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
|
||||||
|
'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
|
||||||
|
'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
|
||||||
|
'\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
|
||||||
|
}
|
||||||
|
|
||||||
|
s = unicodedata.normalize('NFC', s)
|
||||||
|
s = unjoin_ligatures(s)
|
||||||
|
for fr, to in equivalences.items():
|
||||||
|
s = s.replace(fr, to)
|
||||||
|
return s
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue