🧹 dinglehopper: Move all normalization code to extracted_text.py

pull/38/head
Gerber, Mike 4 years ago
parent 009fa55c09
commit 82217a25bb

@ -1,5 +1,5 @@
from .ocr_files import *
from .substitute_equivalences import *
from .extracted_text import *
from .character_error_rate import *
from .word_error_rate import *
from .align import *

@ -7,8 +7,6 @@ from typing import Optional
import attr
from .substitute_equivalences import substitute_equivalences
class Normalization(enum.Enum):
NFC = 1
@ -32,6 +30,62 @@ def normalize_sbb(t):
return normalize(t, Normalization.NFC_SBB)
def unjoin_ligatures(s):
"""Unjoin ligatures, i.e. ff becomes ff."""
equivalences = {
'': 'ſſ',
"\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
'': 'ch',
'': 'ck',
'': 'll',
'': 'ſi',
'': 'ſt',
'': 'fi',
'': 'ff',
'': 'fl',
'': 'ffi',
'': 'ct',
'': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
'\uf532': 'as', # eMOP: Latin small ligature as
'\uf533': 'is', # eMOP: Latin small ligature is
'\uf534': 'us', # eMOP: Latin small ligature us
'\uf535': 'Qu', # eMOP: Latin ligature capital Q small u
'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ
'\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly?
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
'': 'st', # U+FB06 LATIN SMALL LIGATURE ST
}
s = unicodedata.normalize('NFC', s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
return s
def substitute_equivalences(s):
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
# It might make sense to use different rules for GT and for the different OCR
equivalences = {
'': 'ü',
'': 'ä',
'==': '', # → en-dash
'': '', # em-dash → en-dash
'': 'ö',
'': '\'',
'': '-',
'': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
'': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
'': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
'\uF50E': '' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
}
s = unicodedata.normalize('NFC', s)
s = unjoin_ligatures(s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
return s
@attr.s(frozen=True)
class ExtractedText:
"""

@ -1,57 +0,0 @@
import unicodedata
def unjoin_ligatures(s):
"""Unjoin ligatures, i.e. ff becomes ff."""
equivalences = {
'': 'ſſ',
"\ueba7": 'ſſi', # MUFI: LATIN SMALL LIGATURE LONG S LONG S I
'': 'ch',
'': 'ck',
'': 'll',
'': 'ſi',
'': 'ſt',
'': 'fi',
'': 'ff',
'': 'fl',
'': 'ffi',
'': 'ct',
'': 'tz', # MUFI: LATIN SMALL LIGATURE TZ
'\uf532': 'as', # eMOP: Latin small ligature as
'\uf533': 'is', # eMOP: Latin small ligature is
'\uf534': 'us', # eMOP: Latin small ligature us
'\uf535': 'Qu', # eMOP: Latin ligature capital Q small u
'ij': 'ij', # U+0133 LATIN SMALL LIGATURE IJ
'\uE8BF': 'q&', # MUFI: LATIN SMALL LETTER Q LIGATED WITH FINAL ET XXX How to replace this correctly?
'\uEBA5': 'ſp', # MUFI: LATIN SMALL LIGATURE LONG S P
'': 'st', # U+FB06 LATIN SMALL LIGATURE ST
}
s = unicodedata.normalize('NFC', s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
return s
def substitute_equivalences(s):
# These are for OCR-D GT vs Tesseract frk vs Calamari GT4HistOCR
# It might make sense to use different rules for GT and for the different OCR
equivalences = {
'': 'ü',
'': 'ä',
'==': '', # → en-dash
'': '', # em-dash → en-dash
'': 'ö',
'': '\'',
'': '-',
'': 'ä', # LATIN SMALL LETTER A, COMBINING LATIN SMALL LETTER E
'': 'ö', # LATIN SMALL LETTER O, COMBINING LATIN SMALL LETTER E
'': 'ü', # LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
'\uF50E': '' # U+F50E LATIN SMALL LETTER Q WITH ACUTE ACCENT
}
s = unicodedata.normalize('NFC', s)
s = unjoin_ligatures(s)
for fr, to in equivalences.items():
s = s.replace(fr, to)
return s
Loading…
Cancel
Save