diff --git a/qurator/dinglehopper/__init__.py b/qurator/dinglehopper/__init__.py index 0e8ee38..8e58101 100644 --- a/qurator/dinglehopper/__init__.py +++ b/qurator/dinglehopper/__init__.py @@ -1,5 +1,5 @@ from .ocr_files import * -from .substitute_equivalences import * +from .extracted_text import * from .character_error_rate import * from .word_error_rate import * from .align import * diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index 6b1f62f..e873ebd 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -7,8 +7,6 @@ from typing import Optional import attr -from .substitute_equivalences import substitute_equivalences - class Normalization(enum.Enum): NFC = 1 @@ -32,6 +30,62 @@ def normalize_sbb(t): return normalize(t, Normalization.NFC_SBB) +def unjoin_ligatures(s): + """Unjoin ligatures, i.e. ff becomes ff.""" + + equivalences = { + '': 'ſſ', + "\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I + '': 'ch', + '': 'ck', + '': 'll', + '': 'ſi', + '': 'ſt', + 'fi': 'fi', + 'ff': 'ff', + 'fl': 'fl', + 'ffi': 'ffi', + '': 'ct', + '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ + '\uf532': 'as', # eMOP: Latin small ligature as + '\uf533': 'is', # eMOP: Latin small ligature is + '\uf534': 'us', # eMOP: Latin small ligature us + '\uf535': 'Qu', # eMOP: Latin ligature capital Q small u + 'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ + '\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly? + '\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P + 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST + } + s = unicodedata.normalize('NFC', s) + for fr, to in equivalences.items(): + s = s.replace(fr, to) + return s + + +def substitute_equivalences(s): + # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR + # It might make sense to use different rules for GT and for the different OCR + equivalences = { + '': 'ü', + '': 'ä', + '==': '–', # → en-dash + '—': '–', # em-dash → en-dash + '': 'ö', + '’': '\'', + '⸗': '-', + 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E + 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E + 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E + '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT + } + + s = unicodedata.normalize('NFC', s) + s = unjoin_ligatures(s) + for fr, to in equivalences.items(): + s = s.replace(fr, to) + return s + + @attr.s(frozen=True) class ExtractedText: """ diff --git a/qurator/dinglehopper/substitute_equivalences.py b/qurator/dinglehopper/substitute_equivalences.py deleted file mode 100644 index 39be276..0000000 --- a/qurator/dinglehopper/substitute_equivalences.py +++ /dev/null @@ -1,57 +0,0 @@ -import unicodedata - - -def unjoin_ligatures(s): - """Unjoin ligatures, i.e. ff becomes ff.""" - - equivalences = { - '': 'ſſ', - "\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I - '': 'ch', - '': 'ck', - '': 'll', - '': 'ſi', - '': 'ſt', - 'fi': 'fi', - 'ff': 'ff', - 'fl': 'fl', - 'ffi': 'ffi', - '': 'ct', - '': 'tz', # MUFI: LATIN SMALL LIGATURE TZ - '\uf532': 'as', # eMOP: Latin small ligature as - '\uf533': 'is', # eMOP: Latin small ligature is - '\uf534': 'us', # eMOP: Latin small ligature us - '\uf535': 'Qu', # eMOP: Latin ligature capital Q small u - 'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ - '\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly? - '\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P - 'st': 'st', # U+FB06 LATIN SMALL LIGATURE ST - } - s = unicodedata.normalize('NFC', s) - for fr, to in equivalences.items(): - s = s.replace(fr, to) - return s - - -def substitute_equivalences(s): - # These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR - # It might make sense to use different rules for GT and for the different OCR - equivalences = { - '': 'ü', - '': 'ä', - '==': '–', # → en-dash - '—': '–', # em-dash → en-dash - '': 'ö', - '’': '\'', - '⸗': '-', - 'aͤ': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E - 'oͤ': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E - 'uͤ': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E - '\uF50E': 'q́' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT - } - - s = unicodedata.normalize('NFC', s) - s = unjoin_ligatures(s) - for fr, to in equivalences.items(): - s = s.replace(fr, to) - return s