From 8ad7bbd2e7627a99baec6e998d0cf289eef765d0 Mon Sep 17 00:00:00 2001 From: Timotej Lazar Date: Thu, 26 Feb 2015 13:22:17 +0100 Subject: Add a function for Damerau-Levenshtein distance Will be used to check for typos. --- monkey/util.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/monkey/util.py b/monkey/util.py index 9648926..93c6d94 100644 --- a/monkey/util.py +++ b/monkey/util.py @@ -61,3 +61,44 @@ def avg(l): # Logistic function. def logistic(x, L=1, k=1, x_0=0): return L/(1+math.exp(-k*(x-x_0))) + +# Damerau-Levenshtein distance between sequences [a] and [b]. Based on +# https://thetaiko.wordpress.com/2011/01/21/damerau-levenshtein-distance-in-python/ +def damerau_levenshtein(a, b): + # Construct an alphabet out of characters in [a] and [b] with a sequential + # identifier for each word. + alphabet = {} + for char in itertools.chain(a, b): + if char not in alphabet: + alphabet[char] = len(alphabet) + + # Initialize distance matrix. + inf = len(a) + len(b) + 1 + h = [[0]*(len(b)+2) for i in range(len(a)+2)] + h[0][0] = inf + for i in range(0, len(a)+1): + h[i+1][1] = i + h[i+1][0] = inf + for j in range(0, len(b)+1): + h[1][j+1] = j + h[0][j+1] = inf + + # Do the magic. + da = [0 for x in range(0, len(alphabet))] + for i in range(1, len(a)+1): + db = 0 + for j in range(1, len(b)+1): + i1 = da[alphabet[b[j-1]]] + j1 = db + d = 1 + if (a[i-1] == b[j-1]): + d = 0 + db = j + h[i+1][j+1] = min( + h[i][j] + d, # substitution + h[i+1][j] + 1, # insertion + h[i][j+1] + 1, # deletion + h[i1][j1] + (i-i1-1)+1+(j-j1-1)) # transposition + da[alphabet[a[i-1]]] = i + + return h[len(a)+1][len(b)+1] -- cgit v1.2.1