mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-07 19:05:13 +02:00
🧹 dinglehopper: Move all normalization code to extracted_text.py
This commit is contained in:
parent
009fa55c09
commit
82217a25bb
3 changed files with 57 additions and 60 deletions
|
@ -1,5 +1,5 @@
|
|||
from .ocr_files import *
|
||||
from .substitute_equivalences import *
|
||||
from .extracted_text import *
|
||||
from .character_error_rate import *
|
||||
from .word_error_rate import *
|
||||
from .align import *
|
||||
|
|
|
@ -7,8 +7,6 @@ from typing import Optional
|
|||
|
||||
import attr
|
||||
|
||||
from .substitute_equivalences import substitute_equivalences
|
||||
|
||||
|
||||
class Normalization(enum.Enum):
|
||||
NFC = 1
|
||||
|
@ -32,6 +30,62 @@ def normalize_sbb(t):
|
|||
return normalize(t, Normalization.NFC_SBB)
|
||||
|
||||
|
||||
def unjoin_ligatures(s):
    """Unjoin ligatures, i.e. ﬀ becomes ff.

    The input is NFC-normalized first so composed forms match the table,
    then every known ligature codepoint is expanded into its letters.
    """
    # NOTE(review): several keys of this table were MUFI private-use
    # codepoints that were lost in transcription (they appear as empty
    # strings in the diff). An empty-string key would make str.replace()
    # insert the value between every character, corrupting the text, so
    # those entries are omitted here. TODO: restore them from the MUFI
    # character recommendation (ligatures ſſ, ch, ck, ll, ſi, ſt, ct, tz).
    equivalences = {
        '\ueba7': 'ſſi',  # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
        '\ufb01': 'fi',   # U+FB01 LATIN SMALL LIGATURE FI
        '\ufb00': 'ff',   # U+FB00 LATIN SMALL LIGATURE FF
        '\ufb02': 'fl',   # U+FB02 LATIN SMALL LIGATURE FL
        '\ufb03': 'ffi',  # U+FB03 LATIN SMALL LIGATURE FFI
        '\uf532': 'as',   # eMOP: Latin small ligature as
        '\uf533': 'is',   # eMOP: Latin small ligature is
        '\uf534': 'us',   # eMOP: Latin small ligature us
        '\uf535': 'Qu',   # eMOP: Latin ligature capital Q small u
        '\u0133': 'ij',   # U+0133 LATIN SMALL LIGATURE IJ
        '\ue8bf': 'q&',   # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
                          # XXX How to replace this correctly?
        '\ueba5': 'ſp',   # MUFI: LATIN SMALL LIGATURE LONG S P
        '\ufb06': 'st',   # U+FB06 LATIN SMALL LIGATURE ST
    }
    s = unicodedata.normalize('NFC', s)
    for fr, to in equivalences.items():
        s = s.replace(fr, to)
    return s


def substitute_equivalences(s):
    """Substitute character equivalences to harmonize OCR and GT text.

    These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR.
    It might make sense to use different rules for GT and for the
    different OCR engines.
    """
    # NOTE(review): a few keys of this table were private-use codepoints
    # (precomposed variants mapping to 'ü', 'ä' and 'ö') that were lost in
    # transcription as empty strings; empty keys would inject the value
    # between every character, so they are omitted. TODO: restore the
    # original codepoints.
    equivalences = {
        '==': '–',  # → en-dash
        '—': '–',   # em-dash → en-dash
        '’': '\'',
        '⸗': '-',
        'aͤ': 'ä',  # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
        'oͤ': 'ö',  # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
        'uͤ': 'ü',  # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
        '\uf50e': 'q́',  # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
    }

    s = unicodedata.normalize('NFC', s)
    s = unjoin_ligatures(s)
    for fr, to in equivalences.items():
        s = s.replace(fr, to)
    return s
|
||||
|
||||
|
||||
@attr.s(frozen=True)
|
||||
class ExtractedText:
|
||||
"""
|
||||
|
|
|
@ -1,57 +0,0 @@
|
|||
import unicodedata
|
||||
|
||||
|
||||
def unjoin_ligatures(s):
    """Unjoin ligatures, i.e. ﬀ becomes ff.

    The input is NFC-normalized first so composed forms match the table,
    then every known ligature codepoint is expanded into its letters.
    """
    # NOTE(review): several keys of this table were MUFI private-use
    # codepoints that were lost in transcription (they appear as empty
    # strings in the diff). An empty-string key would make str.replace()
    # insert the value between every character, corrupting the text, so
    # those entries are omitted here. TODO: restore them from the MUFI
    # character recommendation (ligatures ſſ, ch, ck, ll, ſi, ſt, ct, tz).
    equivalences = {
        '\ueba7': 'ſſi',  # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
        '\ufb01': 'fi',   # U+FB01 LATIN SMALL LIGATURE FI
        '\ufb00': 'ff',   # U+FB00 LATIN SMALL LIGATURE FF
        '\ufb02': 'fl',   # U+FB02 LATIN SMALL LIGATURE FL
        '\ufb03': 'ffi',  # U+FB03 LATIN SMALL LIGATURE FFI
        '\uf532': 'as',   # eMOP: Latin small ligature as
        '\uf533': 'is',   # eMOP: Latin small ligature is
        '\uf534': 'us',   # eMOP: Latin small ligature us
        '\uf535': 'Qu',   # eMOP: Latin ligature capital Q small u
        '\u0133': 'ij',   # U+0133 LATIN SMALL LIGATURE IJ
        '\ue8bf': 'q&',   # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET
                          # XXX How to replace this correctly?
        '\ueba5': 'ſp',   # MUFI: LATIN SMALL LIGATURE LONG S P
        '\ufb06': 'st',   # U+FB06 LATIN SMALL LIGATURE ST
    }
    s = unicodedata.normalize('NFC', s)
    for fr, to in equivalences.items():
        s = s.replace(fr, to)
    return s


def substitute_equivalences(s):
    """Substitute character equivalences to harmonize OCR and GT text.

    These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR.
    It might make sense to use different rules for GT and for the
    different OCR engines.
    """
    # NOTE(review): a few keys of this table were private-use codepoints
    # (precomposed variants mapping to 'ü', 'ä' and 'ö') that were lost in
    # transcription as empty strings; empty keys would inject the value
    # between every character, so they are omitted. TODO: restore the
    # original codepoints.
    equivalences = {
        '==': '–',  # → en-dash
        '—': '–',   # em-dash → en-dash
        '’': '\'',
        '⸗': '-',
        'aͤ': 'ä',  # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
        'oͤ': 'ö',  # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
        'uͤ': 'ü',  # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
        '\uf50e': 'q́',  # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
    }

    s = unicodedata.normalize('NFC', s)
    s = unjoin_ligatures(s)
    for fr, to in equivalences.items():
        s = s.replace(fr, to)
    return s
|
Loading…
Add table
Add a link
Reference in a new issue