summaryrefslogtreecommitdiff
path: root/monkey
diff options
context:
space:
mode:
authorTimotej Lazar <timotej.lazar@araneo.org>2015-02-26 13:22:17 +0100
committerAleš Smodiš <aless@guru.si>2015-08-11 14:26:02 +0200
commit8ad7bbd2e7627a99baec6e998d0cf289eef765d0 (patch)
tree282dcf4dfc9ef4d2c79a524e97e7d312d3e3d17c /monkey
parent3d73ab2b861f0178001071a7e35c726be46f82ce (diff)
Add a function for Damerau-Levenshtein distance
Will be used to check for typos.
Diffstat (limited to 'monkey')
-rw-r--r--monkey/util.py41
1 files changed, 41 insertions, 0 deletions
diff --git a/monkey/util.py b/monkey/util.py
index 9648926..93c6d94 100644
--- a/monkey/util.py
+++ b/monkey/util.py
@@ -61,3 +61,44 @@ def avg(l):
# Logistic function.
def logistic(x, L=1, k=1, x_0=0):
return L/(1+math.exp(-k*(x-x_0)))
+
+# Damerau-Levenshtein distance between sequences [a] and [b]. Based on
+# https://thetaiko.wordpress.com/2011/01/21/damerau-levenshtein-distance-in-python/
+def damerau_levenshtein(a, b):
+ # Construct an alphabet out of characters in [a] and [b] with a sequential
+ # identifier for each word.
+ alphabet = {}
+ for char in itertools.chain(a, b):
+ if char not in alphabet:
+ alphabet[char] = len(alphabet)
+
+ # Initialize distance matrix.
+ inf = len(a) + len(b) + 1
+ h = [[0]*(len(b)+2) for i in range(len(a)+2)]
+ h[0][0] = inf
+ for i in range(0, len(a)+1):
+ h[i+1][1] = i
+ h[i+1][0] = inf
+ for j in range(0, len(b)+1):
+ h[1][j+1] = j
+ h[0][j+1] = inf
+
+ # Do the magic.
+ da = [0 for x in range(0, len(alphabet))]
+ for i in range(1, len(a)+1):
+ db = 0
+ for j in range(1, len(b)+1):
+ i1 = da[alphabet[b[j-1]]]
+ j1 = db
+ d = 1
+ if (a[i-1] == b[j-1]):
+ d = 0
+ db = j
+ h[i+1][j+1] = min(
+ h[i][j] + d, # substitution
+ h[i+1][j] + 1, # insertion
+ h[i][j+1] + 1, # deletion
+ h[i1][j1] + (i-i1-1)+1+(j-j1-1)) # transposition
+ da[alphabet[a[i-1]]] = i
+
+ return h[len(a)+1][len(b)+1]