In [1]:
import numpy as np
import unicodedata
import inspect

# Levenshtein edit distance

dinglehopper used to have its own (very inefficient) Levenshtein edit distance implementation, but now uses RapidFuzz.

In [2]:
from rapidfuzz.distance.Levenshtein import distance as levenshtein

In [3]:
# Sanity checks for the plain Levenshtein distance imported from RapidFuzz:
# identical strings, single substitutions, empty operands and mixed edits.
assert levenshtein('a', 'a') == 0  # identical strings
assert levenshtein('a', 'b') == 1  # one substitution
assert levenshtein('Foo', 'Bar') == 3  # every character differs
assert levenshtein('', '') == 0  # both operands empty
assert levenshtein('Foo', '') == 3  # distance to the empty string is the length
assert levenshtein('', 'Foo') == 3
assert levenshtein('Fnord', 'Food') == 2
assert levenshtein('M√ºll', 'Mull') == 1
assert levenshtein('Abstand', 'Sand') == 4

This fails for different representations of the "same" canonically equivalent string:

In [4]:
word1 = unicodedata.normalize('NFC', 'Schly√±')
word2 = unicodedata.normalize('NFD', 'Schly√±')  # Different, decomposed!
levenshtein(word1, word2)

2

In [5]:
# Same, but for grapheme clusters
from uniseg.graphemecluster import grapheme_clusters

word1 = list(grapheme_clusters(unicodedata.normalize('NFC', 'Schly√±')))
word2 = list(grapheme_clusters(unicodedata.normalize('NFD', 'Schly√±')))
levenshtein(word1, word2)

1

Better.

Let's define an edit distance function that uses the basic Levenshtein algorithm, but knows about Unicode normalization and grapheme clusters!

In [6]:
from qurator.dinglehopper.edit_distance import distance
print(inspect.getsource(distance))

@multimethod
def distance(s1: str, s2: str):
    """Compute the Levenshtein edit distance between two Unicode strings

    Note that this is different from levenshtein() as this function knows about Unicode
    normalization and grapheme clusters. This should be the correct way to compare two
    Unicode strings.
    """
    seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
    seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
    return levenshtein(seq1, seq2)



In [7]:
word1 = unicodedata.normalize('NFC', 'Schly√±')
word2 = unicodedata.normalize('NFD', 'Schly√±')  # Different, decomposed!

distance(word1, word2)

0

This should give us the correct answer of 1 for 'Schlyñ' (with LATIN SMALL LETTER N WITH TILDE) vs 'Schlym̃' (with LATIN SMALL LETTER M + COMBINING TILDE):

In [8]:
word1 = 'Schly√±'
word2 = 'SchlymÃÉ'
#print('Lengths, as far as Python is concerned:', len(word1), len(word2))  # ‚Üí gives 6 and 7!
distance(word1, word2)

1

# Edit operations

python-Levenshtein and RapidFuzz support backtracing, i.e. giving a sequence of edit operations that transforms one word into another:

In [9]:
from rapidfuzz.distance.Levenshtein import editops

editops('Foo', 'Fon')

[('replace', 2, 2)]

In [10]:
print(editops('K√§ptn', 'K√§pt\'n'))

[('insert', 4, 4)]


In [11]:
print(editops('Delete something', 'Deletesomething'))

[('delete', 6, 6)]


In [12]:
print(editops('A more difficult example', 'Amore diffic√ºlt  exampl'))

[('delete', 1, 1), ('replace', 13, 12), ('insert', 16, 15), ('delete', 23, 23)]


Let's try it with a difficult example that needs grapheme cluster handling:

In [13]:
word1 = 'Schly√±'  # with LATIN SMALL LETTER N WITH TILDE
word2 = 'SchlymÃÉ'  # with LATIN SMALL LETTER M + COMBINING TILDE

editops(word1, word2)

[('insert', 5, 5), ('replace', 5, 6)]

That doesn't look right, let's redefine it with grapheme cluster support:

In [14]:
from qurator.dinglehopper.edit_distance import editops
print(inspect.getsource(editops))

def editops(word1, word2):
    """
    Return sequence of edit operations transforming one string to another.

    Note that this returns indices to the _grapheme clusters_, not characters!
    """
    word1 = list(grapheme_clusters(unicodedata.normalize("NFC", word1)))
    word2 = list(grapheme_clusters(unicodedata.normalize("NFC", word2)))
    return levenshtein_editops(word1, word2)



In [15]:
word1 = 'Schly√±'  # with LATIN SMALL LETTER N WITH TILDE
word2 = 'SchlymÃÉ'  # with LATIN SMALL LETTER M + COMBINING TILDE

editops(word1, word2)

[('replace', 5, 5)]

🎉

Here, a problem is that the positions are grapheme cluster positions, not Python character indexes!

# Character error rate

[digitisation.eu](https://sites.google.com/site/textdigitisation/qualitymeasures/computingerrorrates) defines the character error rate (CER) as:

$$
\text{CER} = \frac{i + s + d}{n}
$$

where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropriate as they *are* clear about this when computing the word error rate.)

Because our edit distance is equal to $i + s + d$, we can thus define:

In [16]:
from qurator.dinglehopper.character_error_rate import character_error_rate
print(inspect.getsource(character_error_rate))

def character_error_rate(reference, compared) -> float:
    """
    Compute character error rate.

    :return: character error rate
    """
    cer, _ = character_error_rate_n(reference, compared)
    return cer



In [17]:
assert character_error_rate('Foo', 'B√§r') == 3/3
assert character_error_rate('Fnord', 'Food') == 2/5
assert character_error_rate('Food', 'Fnord') == 2/4
assert character_error_rate('Schly√±', 'SchlymÃÉ') == 1/6

In [18]:
# From experiments/2019-07-ocrevalUAtion: These are already preprocessed by the equivalences in equivalences-tess-frk.csv.
gt = """115 √ºber die vielen Sorgen wegen de≈ø≈øelben verga√ü Hartkopf, der Frau Amtm√§nnin das ver‚∏ó ≈øprochene zu √ºberliefern. ‚Äî Ein Erpre≈ø≈øer wurde an ihn abge≈øchickt, um ihn ums Him‚∏ó melswillen zu ≈øagen, da√ü er das Ver≈øprochene gleich den Augenblick √ºberbringen m√∂chte, die Frau Amtm√§nnin h√§tte ≈øich auf ihn verla≈ø≈øen, und nun w√º√üte ≈øie nicht, was ≈øie anfangen ≈øollte. Den Augenblick ≈øollte er kommen, ≈øon≈øt vergieng ≈øie in ihrer Ang≈øt. ‚Äî Die G√§≈øte w√§ren ≈øchon angekommen, und es fehlte ihr doch noch an allem. ‚Äî Hartkopf mu√üte ≈øich er≈øt be≈øinnen, und endlich nach langem Nachdenken fiel es ihm er≈øt wieder ein. ‚Äî Er langte den Zettel aus dem Accisbuche heraus, und ≈øagte ≈øeiner Frau, da√ü ≈øie das, was da w√§re, herbey≈øchaffen m√∂chte. Jnde√ü mangelten doch einige Generalia, die al≈øo wegfielen. ‚Äî Hartkopf gieng ≈øelb≈øt mit und √ºberbrachte es. ‚Äî ‚ÄûHerr Jemine! er b√∂≈øer Mann!‚Äú ‚Äî ≈øchrie ihm die Frau Amtm√§nnin entgegen, und ≈øchlug ihn auf die Schulter und blickte den Korb, der voll gedr√ºckt, ger√ºttelt und √ºberÔ¨Ç√º≈ø≈øig in ihren Schoos gegeben werden ≈øollte, mit Augen voller Freu‚∏ó H 2"""
tess = """emm unmit; Lis √úbey die vielen Sorgen wegen" de≈ø≈øelben verga√ü Hartkopf, der Frau! Amim√§nnin das- ver ≈øprochene zu √ºberliefeen. ==" Ein Epypre≈ø≈øer- wurde an ihn abge≈øchieet', um' ihn ums Hime melswillen zu ≈øagen, "da√ü er das Ver≈øyrochene leich den Augenblick "√ºberbringen m√∂chte, die Frau Amtm√§nnin h√§tte ≈øich auf ihn veria≈ø≈øen, und nun w√º√üte ≈øie- nicht, was ≈øie anfangen ≈øollte, =! 'Den Augenblick ≈øollte "er kommen, ≈øon≈øt vergieng ≈øie in ihrer Ang≈øt. == Die S√§ua≈øie- w√§ren. ≈øchon angekommen, und es fehlte ihr do < noch an alien, === Hartfopyf mu√üte ≈øich er≈øt TIM und endlich mach langem Rachdenken fiel es ihm er≈øt wieder ein, ==. Ex langte den Zettel aus dem- Accisbuche heraus, und ≈øagte ≈øeiner Frau, da√ü ≈øie das , was da w√§re, herbey≈øchaffen mschte. ZIude√ü ‚Äûmangelten doch einige Generalia, die al≈øo wegfielen. == ' Havrkopf gieng ≈øelb≈øt mit und √ºberbrachte es == | ‚ÄûHerr Jemine! er b√∂≈øer Mann 1-2 ≈øchrie ihm die Frau Amtm√§nnin entgegen, und ≈øchlug ihn auf die Schulter und blickte den Korb, der - voll gedr√ºckt, ger√ºttelt und √ºberfirf≈øig in ihren Ss HEILE werden ≈øolite, mit Augen voller EE) Fron?"""

In [19]:
# Character error rate of the Tesseract output against the ground truth, four decimals.
print(f'{character_error_rate(gt, tess):.4f}')

0.1190


XXX This gives a smaller CER than ocrevalUAtion (which gives 0.1228). Why?

In [20]:
levenshtein(gt, tess)/len(gt)

0.1190253045923149

That's ~ the same, so I think it's not about the character segmentation. Check that we're only dealing with single-codepoint grapheme clusters:

In [21]:
# Every grapheme cluster in both texts must consist of exactly one code point.
for text in (gt, tess):
    assert all(len(cluster) == 1 for cluster in grapheme_clusters(text))

Maybe ocrevalUAtion doesn't count whitespace?

In [22]:
def remove_whitespace(s):
    """Return ``s`` with every whitespace character removed.

    Besides the plain space (U+0020) this also drops tabs, newlines and other
    Unicode whitespace such as NO-BREAK SPACE, so that texts containing line
    breaks are treated the same way as single-line texts.

    :param s: input string
    :return: copy of ``s`` without any whitespace characters
    """
    # str.split() without arguments splits on runs of any whitespace and
    # never yields empty pieces, so joining the pieces removes all of it.
    return ''.join(s.split())
remove_whitespace(gt)

'115√ºberdievielenSorgenwegende≈ø≈øelbenverga√üHartkopf,derFrauAmtm√§nnindasver‚∏ó≈øprochenezu√ºberliefern.‚ÄîEinErpre≈ø≈øerwurdeanihnabge≈øchickt,umihnumsHim‚∏ómelswillenzu≈øagen,da√üerdasVer≈øprochenegleichdenAugenblick√ºberbringenm√∂chte,dieFrauAmtm√§nninh√§tte≈øichaufihnverla≈ø≈øen,undnunw√º√üte≈øienicht,was≈øieanfangen≈øollte.DenAugenblick≈øollteerkommen,≈øon≈øtvergieng≈øieinihrerAng≈øt.‚ÄîDieG√§≈øtew√§ren≈øchonangekommen,undesfehlteihrdochnochanallem.‚ÄîHartkopfmu√üte≈øicher≈øtbe≈øinnen,undendlichnachlangemNachdenkenfielesihmer≈øtwiederein.‚ÄîErlangtedenZettelausdemAccisbucheheraus,und≈øagte≈øeinerFrau,da√ü≈øiedas,wasdaw√§re,herbey≈øchaffenm√∂chte.Jnde√ümangeltendocheinigeGeneralia,dieal≈øowegfielen.‚ÄîHartkopfgieng≈øelb≈øtmitund√ºberbrachtees.‚Äî‚ÄûHerrJemine!erb√∂≈øerMann!‚Äú‚Äî≈øchrieihmdieFrauAmtm√§nninentgegen,und≈øchlugihnaufdieSchulterundblicktedenKorb,dervollgedr√ºckt,ger√ºtteltund√ºberÔ¨Ç√º≈ø≈øiginihrenSchoosgegebenwerden≈øollte,mitAugenvollerFreu‚∏óH2'

In [23]:
# CER with whitespace stripped from both texts, printed with four decimals.
print(format(
    character_error_rate(remove_whitespace(gt), remove_whitespace(tess)),
    '.4f',
))

0.1324


Now it's larger than ocrevalUAtion 🤷‍♂️

# Word error rate

## Word segmentation

Naively split on spaces.

(Note: ocrevalUAtion does confusing things here, like the Token splitting in a hash function, with an empty pattern?!)

In [24]:
def naive_word_split(s):
    """Split ``s`` into tokens at every single space character.

    Deliberately naive: punctuation stays attached to the tokens, and
    consecutive spaces produce empty strings in the result.
    """
    separator = ' '
    tokens = s.split(separator)
    return tokens

In [25]:
example_text = "The quick (‚Äúbrown‚Äù) fox can't jump 32.3 feet, right?"

In [26]:
naive_word_split(example_text)

['The',
 'quick',
 '(‚Äúbrown‚Äù)',
 'fox',
 "can't",
 'jump',
 '32.3',
 'feet,',
 'right?']

Let's do it the Unicode way (Unicode Standard Annex #29, Unicode Text Segmentation): Split on word boundaries using the uniseg library and ignore words that contain only whitespace, punctuation "and similar characters":

In [27]:
from qurator.dinglehopper.word_error_rate import words
print(inspect.getsource(words))

list(words(example_text))

@multimethod
def words(s: str):
    """Extract words from a string"""

    # Patch uniseg.wordbreak.word_break to deal with our private use characters. See also
    # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt
    old_word_break = uniseg.wordbreak.word_break

    def new_word_break(c, index=0):
        if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area
            return "ALetter"
        else:
            return old_word_break(c, index)

    uniseg.wordbreak.word_break = new_word_break

    # Check if c is an unwanted character, i.e. whitespace, punctuation, or similar
    def unwanted(c):

        # See https://www.fileformat.info/info/unicode/category/index.htm
        # and https://unicodebook.readthedocs.io/unicode.html#categories
        unwanted_categories = "O", "M", "P", "Z", "S"
        unwanted_subcategories = "Cc", "Cf"

        subcat = unicodedata.category(c)
        cat = subcat[0]
        return cat in unwanted_categories or subcat in 

['The', 'quick', 'brown', 'fox', "can't", 'jump', '32.3', 'feet', 'right']

In [28]:
list(words('Der schnelle [‚Äûbraune‚Äú] Fuchs kann keine 3,14 Meter springen, oder?'))

['Der',
 'schnelle',
 'braune',
 'Fuchs',
 'kann',
 'keine',
 '3,14',
 'Meter',
 'springen',
 'oder']

In [29]:
list(words('Dies ist ein Beispielsatz. Oh, ja.'))

['Dies', 'ist', 'ein', 'Beispielsatz', 'Oh', 'ja']

It's probably not correct for Chinese and Japanese, but at least it doesn't rely on spaces.

In [30]:
list(words('ÊàëÂæàÈ´òËààË∑ü‰Ω†Ë¶ãÈù¢'))  # "Pleased to meet you" in Mandarin, Traditional writing

['Êàë', 'Âæà', 'È´ò', 'Ëàà', 'Ë∑ü', '‰Ω†', 'Ë¶ã', 'Èù¢']

In [31]:
list(words('ÂåªËÄÖ„ÇíÂëº„Çì„Åß„Åè„Å†„Åï„ÅÑ„ÄÇ'))

['Âåª', 'ËÄÖ', '„Çí', 'Âëº', '„Çì', '„Åß', '„Åè', '„Å†', '„Åï', '„ÅÑ']

## Word error rate

For the word error rate, normalize again and compare sequences of words.

In [32]:
from qurator.dinglehopper.word_error_rate import word_error_rate
print(inspect.getsource(word_error_rate))

def word_error_rate(reference, compared) -> float:
    wer, _ = word_error_rate_n(reference, compared)
    return wer



In [33]:
word_error_rate('Dies ist ein Beispielsatz.', 'Dies isi ein Beispielsatz,')

0.25

In [34]:
word_error_rate('Fnord ist verdampfter Kr√§utertee!', 'Fn√≤rd ist verdmpfter Krautertee.')

0.75

In [35]:
word_error_rate(gt, tess)

0.18823529411764706

This is a little larger than the ocrevalUAtion result!