From 7c60e736e514ffb81b12f5ccd45e36c8fcdcaffa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mehmet=20Akal=C4=B1n?= <mehmet.akalin@ceng.metu.edu.tr>
Date: Mon, 10 Aug 2015 10:53:52 +0200
Subject: Add code to check for typos

---
 monkey/edits.py  | 29 ++++++++++++++++++++---------
 monkey/monkey.py | 25 +++++++++++++++++++++++--
 monkey/test.py   |  6 ++++++
 3 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/monkey/edits.py b/monkey/edits.py
index 2dcd653..01e1ffd 100644
--- a/monkey/edits.py
+++ b/monkey/edits.py
@@ -40,7 +40,7 @@ def trace_graph(trace):
         code_next = action.apply(code)
 
         if action.type == 'test':
-            submissions.add(code)
+            submissions.add((code, action.total == action.passed))
             if action.total == action.passed:
                 done = True
 
@@ -172,6 +172,7 @@ def get_edits_from_traces(traces):
     edits = collections.Counter()
     submissions = collections.Counter()
     queries = collections.Counter()
+    names = collections.Counter()
 
     # Counts of traces where each line appears as a leaf / any node.
     n_leaf = collections.Counter()
@@ -179,11 +180,17 @@ def get_edits_from_traces(traces):
 
     for trace in traces:
         nodes, trace_submissions, trace_queries = trace_graph(trace)
-
+        counted_tokens = []
         # Update the submissions/queries counters (use normalized variables).
-        for submission in trace_submissions:
-            code = stringify(rename_vars(tokenize(submission)))
-            submissions[code] += 1
+        for (submission, correct) in trace_submissions:
+            if correct:
+                tokens = list(tokenize(submission))
+                for token in tokens:
+                    if token.type == 'NAME' and token.val not in counted_tokens:
+                        names[token.val] += 1
+                        counted_tokens.append(token.val)
+                code = stringify(rename_vars(tokens))
+                submissions[code] += 1
         for query in trace_queries:
             code = stringify(rename_vars(tokenize(query)))
             queries[code] += 1
@@ -215,7 +222,7 @@ def get_edits_from_traces(traces):
     for edit, p in edits.items():
         edits[edit] = logistic(p, k=3, x_0=avg_p)
 
-    return edits, submissions, queries
+    return edits, submissions, queries, names
 
 def classify_edits(edits):
     inserts = {}
@@ -244,13 +251,17 @@ if __name__ == '__main__':
     edits = {}
     submissions = {}
     queries = {}
+    names = {}
     for problem in Problem.objects.all():
         print(problem.name)
         pid = problem.pk
         attempts = Attempt.objects.filter(problem=problem, done=True) \
                                   .exclude(user__groups=None)
         traces = [a.trace for a in attempts]
-        if traces:
-            edits[pid], submissions[pid], queries[pid] = get_edits_from_traces(traces)
+        try:
+            edits[pid], submissions[pid], queries[pid], names[pid] = get_edits_from_traces(traces)
+        except:
+            pass
+
+    pickle.dump((edits, submissions, queries, names), open('edits.pickle', 'wb'))
 
-    pickle.dump((edits, submissions, queries), open('edits.pickle', 'wb'))
diff --git a/monkey/monkey.py b/monkey/monkey.py
index 99f5a2a..02048de 100755
--- a/monkey/monkey.py
+++ b/monkey/monkey.py
@@ -2,11 +2,11 @@
 
 import math
 import time
-
 import prolog.engine
+
 from .edits import classify_edits
 from prolog.util import Token, annotate, compose, map_vars, normalized, rename_vars, stringify
-from .util import PQueue
+from .util import damerau_levenshtein, PQueue
 
 # Check whether all tests for problem [name] succeed.
 def test(name, code):
@@ -311,3 +311,24 @@ def fix_hints(code, path):
 
         program[idx:idx+len(a)] = [t.clone(pos=program[idx].pos) for t in b]
         yield fix_type, start, end, msg
+
+
+# Check code for typos: for each NAME token, yield a hint suggesting the nearest more frequent name in names.
+def check_typos(code, names):
+    for token in annotate(code):
+        if token.type == 'NAME':
+            nearest_name = ' '  # placeholder; its length of 1 keeps the ratio check below well-defined
+            nearest_dist = 1000  # sentinel; presumably larger than any real distance, so no hint if nothing matches
+            own_count = names.get(token.val, 0) # count recorded for this token's own spelling (0 if absent);
+                                                # a candidate must have a strictly higher count to be suggested
+            for name in names.items():          # name is a (name, count) pair
+                if name[0] == token.val:        # never suggest the token's own spelling
+                    continue
+
+                distance = damerau_levenshtein(token.val, name[0])
+
+                if distance < nearest_dist and distance > 0  and own_count < name[1]:
+                    nearest_dist = distance     # remember the closest more-frequent name found so far
+                    nearest_name = name[0]
+            if nearest_dist > 0 and nearest_dist/len(nearest_name) <= 1/3:  # distance at most a third of the suggestion's length
+                yield  'typo', token.pos, token.pos + len(token.val) , 'Did you mean "{}"?'.format(nearest_name)  # (type, start, end, message)
diff --git a/monkey/test.py b/monkey/test.py
index 6549c1b..bb28e9b 100755
--- a/monkey/test.py
+++ b/monkey/test.py
@@ -36,6 +36,7 @@ attempts = Attempt.objects.filter(problem=problem) \
 edits = tutor_apps.get_app_config('tutor').edits[problem.pk]
 submissions = tutor_apps.get_app_config('tutor').submissions[problem.pk]
 queries = tutor_apps.get_app_config('tutor').queries[problem.pk]
+names = tutor_apps.get_app_config('tutor').names[problem.pk]
 
 # Find incorrect submissions.
 incorrect_all = []
@@ -135,6 +136,11 @@ elif sys.argv[2] == 'info':
         for (before, after), cost in sorted(changes.items(), key=lambda x: x[1]):
             print(' {:.4f}\t{} → {}'.format(cost, stringify(before) if before else 'ε',
                                                   stringify(after) if after else 'ε'))
+    # Print all names and their counts, sorted by increasing count.
+    elif sys.argv[3] == 'names':
+        for name, count in sorted(names.items(), key=lambda x: x[1]):
+            print(' {:.4f}\t{}'.format(count,  name))
+
     # Print all student submissions not (yet) corrected.
     elif sys.argv[3] == 'unsolved':
         for p in sorted(incorrect):
-- 
cgit v1.2.1