path: root/monkey/edits.py
author    Timotej Lazar <timotej.lazar@fri.uni-lj.si>    2015-11-18 13:20:00 +0100
committer Timotej Lazar <timotej.lazar@fri.uni-lj.si>    2015-12-11 16:11:15 +0100
commit    fa39fe7bfedd0b2e615d369adb5b510ceb9b857f (patch)
tree      abf381b007d1816247cab534ac8e2152695ad596 /monkey/edits.py
parent    dd723bd01634fa5ffc85402ea10947e472b257af (diff)
Use a more general method for extracting edits
This is a large overhaul of the monkey code. Previously, only edits within individual lines were tracked, which required a Prolog-specific method for splitting a program into a list of lines for every rule. In this version, modifications can be tracked within arbitrary code ranges.

Ranges to be tracked are determined by selecting "interesting" subtrees in the AST of the starting code version. The new method is simpler, less language-dependent and easier to extend. The downside is that a program must be syntactically correct before we can attempt to fix it (the previous approach could handle programs with syntax errors in some cases).

This commit also integrates a call to monkey.fix into prolog_session.hint, running it if no other hint is found.
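To make the range tracking concrete, here is a minimal sketch of the offset bookkeeping described above. The function name and the simplified (kind, offset) action format are illustrative only; the actual code in monkey/edits.py works on parsed trace actions and additionally applies a heuristic when an insert lands exactly on a range's right boundary:

    # A tracked range is a pair of character offsets into the current code,
    # shifted after every insert/remove action so it keeps covering the same
    # piece of text.
    def shift_range(start, end, kind, offset):
        """Return the new (start, end) of a tracked range after one action."""
        if kind == 'remove':
            if offset < end:
                end -= 1
            if offset < start:
                start -= 1
        elif kind == 'insert':
            if offset < start:
                start += 1
            if offset <= end:  # the real code is stricter at offset == end
                end += 1
        return start, end

    # Follow the range covering "bar" in "foo bar" (offsets 4-7).
    start, end = 4, 7
    for kind, offset in [('insert', 0), ('remove', 5), ('insert', 7)]:
        start, end = shift_range(start, end, kind, offset)
    print(start, end)  # -> 5 8

Once the first correct submission closes a range, the text it covered at the start and at the end of the trace form one observed edit.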
Diffstat (limited to 'monkey/edits.py')
-rw-r--r--    monkey/edits.py    332
1 file changed, 142 insertions(+), 190 deletions(-)
diff --git a/monkey/edits.py b/monkey/edits.py
index 17f13d3..7744fa2 100644
--- a/monkey/edits.py
+++ b/monkey/edits.py
@@ -19,26 +19,20 @@ import math
from .action import expand, parse
from .graph import Node
-from prolog.util import annotate, normalized, rename_vars, stringify, tokenize
+from prolog.util import (Token, normalized, parse as prolog_parse, rename_vars,
+    rename_vars_ast2, rename_vars_list, interesting_ranges, stringify, tokenize)
from .util import get_line, avg, logistic
-# Parse the sequence of actions in [trace] and return a directed acyclic graph
-# representing development history. Each node represents a particular version
-# of some line, and each edge represents a "line edit" (contiguous sequence of
-# inserts/removes within a single line).
-# Return a list of nodes with root as the first element. Also return sets of
-# submissions (user-tested program versions) and queries in this attempt.
-def trace_graph(trace):
- # Return values.
- nodes = [Node([0, 0, ()])] # Node data: rank (Y), line no. (X), and tokens.
+def get_edits_from_trace(trace, test, uid):
submissions = set() # Program versions at 'test' actions.
queries = set() # Queries run by the student.
- # State variables.
- leaves = {0: nodes[0]} # Current leaf node for each line.
- rank = 1 # Rank (order / y-position) for the next node.
- code_next = '' # Program code after applying the current action.
- done = False # Set to True on first correct version.
+    # For each observed edit, store the user id of each trace (with repeats)
+    # in which it was observed. The normalized AST passed to add_edit is
+    # currently unused.
+ edits = collections.defaultdict(list)
+ def add_edit(path, start, end, tree):
+ if start == end:
+ return
+        edits[(path, start, end)].append(uid)
# Parse trace actions and ensure there is a separate action for each
# inserted/removed character.
@@ -49,194 +43,139 @@ def trace_graph(trace):
# Only a few traces fail to parse, so just skip them.
actions = []
+ # State variables.
+ open_edits = []
+ code_next = '' # Program code after applying the current action.
+ done = False # Set to True on first correct version.
+ prev_tree = None
+ prev_action = None
+
for action_id, action in enumerate(actions):
code = code_next
code_next = action.apply(code)
- if action.type == 'test':
- submissions.add((code, action.total == action.passed))
- if action.total == action.passed:
- done = True
-
- elif action.type == 'solve' or action.type == 'solve_all':
- queries.add(action.query.rstrip(' .'))
-
- elif action.type == 'insert' or action.type == 'remove':
- # Ignore edits after the first correct version.
- if done:
- continue
-
- # Number of the changed line.
- line = code[:action.offset].count('\n')
- # Current leaf node for this line.
- parent = leaves[line]
- # Tokens in this line after applying [action].
- tokens_next = tuple(tokenize(get_line(code_next, line)))
-
- # If a new node is inserted, clone each leaf into the next rank.
- # This makes it easier to print the graph for graphviz; when
- # analyzing the graph, duplicate nodes without siblings should be
- # ignored.
- new_leaves = {}
-
- if action.text == '\n':
- if action.type == 'insert':
- tokens_next_right = tuple(tokenize(get_line(code_next, line+1)))
-
- child_left = Node([rank, line, tokens_next])
- parent.add_out(child_left)
-
- child_right = Node([rank, line+1, tokens_next_right])
- parent.add_out(child_right)
-
- # Create new leaf nodes.
- for i, leaf in leaves.items():
- if i < line:
- new_leaves[i] = Node([rank, i, leaf.data[2]])
- leaf.add_out(new_leaves[i])
- elif i > line:
- new_leaves[i+1] = Node([rank, i+1, leaf.data[2]])
- leaf.add_out(new_leaves[i+1])
- new_leaves[line] = child_left
- new_leaves[line+1] = child_right
-
- elif action.type == 'remove':
- parent_right = leaves[line+1]
-
- child = Node([rank, line, tokens_next])
- parent_right.add_out(child)
- parent.add_out(child)
-
- # Create new leaf nodes.
- for i, leaf in leaves.items():
- if i < line:
- new_leaves[i] = Node([rank, i, leaf.data[2]])
- leaf.add_out(new_leaves[i])
- elif i > line+1:
- new_leaves[i-1] = Node([rank, i-1, leaf.data[2]])
- leaf.add_out(new_leaves[i-1])
- new_leaves[line] = child
- else:
- # Skip the node if the next action is insert/remove (except \n)
- # on the same line.
- if action_id < len(actions)-1:
- action_next = actions[action_id+1]
- if action_next.type in ('insert', 'remove'):
- line_next = code_next[:action_next.offset].count('\n')
- if action_next.text != '\n' and line == line_next:
- continue
-
- # Skip the node if it is the same as the parent.
- if tokens_next == parent.data[2]:
- continue
-
- child = Node([rank, line, tokens_next])
- parent.add_out(child)
-
- # Create new leaf nodes.
- for i, leaf in leaves.items():
- if i != line:
- new_leaves[i] = Node([rank, i, leaf.data[2]])
- leaf.add_out(new_leaves[i])
- new_leaves[line] = child
-
- leaves = new_leaves
- nodes += leaves.values()
- rank += 1
-
- return nodes, submissions, queries
-
-# Return a set of edits that appear in the trace_graph given by [nodes].
-def graph_edits(nodes):
- edits = set()
- for a in nodes:
- a_data = a.data[2]
- for b in a.eout:
- b_data = b.data[2]
-
- # Normalize a → b into start → end. Reuse variable names from a
- # when normalizing b.
- var_names = {}
- start = normalized(a_data, var_names)
- end = normalized(b_data, var_names)
-
- if start == end:
- continue
-
- # An edit start → ε happens each time the user inserts \n; ignore
- # such cases.
- if not end and len(a.eout) > 1:
- continue
-
- # Skip edits where start/end is composed of more than one part.
- # TODO improve trace_graph to handle this instead.
- if [t for t in annotate(stringify(start)) if t.part > 0]:
- continue
- if [t for t in annotate(stringify(end)) if t.part > 0]:
- continue
-
- edits.add((start, end))
- return edits
-
-# Build an edit graph for each trace and find "meaningful" (to be defined)
-# edits. Return a dictionary of edits and their frequencies, and also
-# submissions and queries in [traces].
-def get_edits_from_traces(traces):
- # Return values: counts for observed edits, lines, submissions and queries.
- edits = collections.Counter()
+ if action.type in {'solve', 'solve_all', 'test'}:
+            if action.type in {'solve', 'solve_all'}:
+ queries.add(action.query.rstrip(' .'))
+ elif action.type == 'test':
+ correct = test(code)
+ submissions.add((code, correct))
+ if correct:
+ # Ignore actions after the first correct version.
+ done = True
+ break
+
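+        # When the code parses to a new AST, open an edit for each
+        # "interesting" range that is not being tracked yet.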
+ tree = prolog_parse(code)
+ if tree and tree.leaves() and tree != prev_tree:
+ for terminals, path in interesting_ranges(tree):
+ pos_start = terminals[0].pos
+ pos_end = terminals[-1].pos + len(terminals[-1].val)
+ # If there is an open edit with the same range, don't add a new one.
+                found = any(e_pos_start == pos_start and e_pos_end == pos_end
+                            for _, _, _, e_pos_start, e_pos_end in open_edits)
+ if not found:
+ #print('OPENING {}'.format(terminals))
+ open_edits.append([tree, terminals, path, pos_start, pos_end])
+ prev_tree = tree
+
+ if action.type in {'insert', 'remove'}:
+ new_open_edits = []
+ for start_tree, start_tokens, path, pos_start, pos_end in open_edits:
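+                # Shift this range to account for the inserted/removed
+                # character: an action before the range moves both bounds,
+                # one inside it moves only the end.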
+ new_pos_start, new_pos_end = pos_start, pos_end
+ if action.type == 'remove':
+ if action.offset < pos_end:
+ new_pos_end -= 1
+ if action.offset < pos_start:
+ new_pos_start -= 1
+ elif action.type == 'insert':
+ if action.offset < pos_start:
+ new_pos_start += 1
+ new_pos_end += 1
+ elif action.offset == pos_start:
+ new_pos_end += 1
+ elif action.offset < pos_end:
+ new_pos_end += 1
+ elif action.offset == pos_end:
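+                        # Inserting at the right boundary: extend the range only
+                        # if this action continues a contiguous typing sequence
+                        # and the typed character is not the one that originally
+                        # followed the tracked tokens.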
+                        if (prev_action is None or
+                                (prev_action.type in ('insert', 'remove') and
+                                 prev_action.offset == action.offset-1)):
+ orig_next = None
+ for terminal in start_tree.leaves():
+ if terminal.pos >= start_tokens[-1].pos + len(start_tokens[-1].val):
+ orig_next = terminal
+ break
+ if not (orig_next and orig_next.val[0] == action.text):
+ new_pos_end += 1
+ if new_pos_start != new_pos_end:
+ new_open_edits.append([start_tree, start_tokens, path, new_pos_start, new_pos_end])
+ open_edits = new_open_edits
+ prev_action = action
+
+ if done:
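+        # The first correct submission closes all open edits: record each one,
+        # renaming variables consistently across its start and end tokens.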
+ for start_tree, start_tokens, path, pos_start, pos_end in open_edits:
+ end_tokens = tokenize(code[pos_start:pos_end])
+ names = {}
+ start_normal = rename_vars_list(start_tokens, names)
+ end_normal = rename_vars_list(end_tokens, names)
+ norm_tree = rename_vars_ast2(start_tree, names)
+ add_edit(path, tuple(start_normal), tuple(end_normal), norm_tree)
+
+ return edits, submissions, queries
+
+def get_edits_from_solutions(solutions, test):
+    # For each observed edit, store the user ids (with repeats) of the traces
+    # in which it was observed.
submissions = collections.Counter()
queries = collections.Counter()
- names = collections.Counter()
+ edits = collections.defaultdict(list)
+
+ for solution in solutions:
+ trace = solution.trace
+ uid = solution.codeq_user_id
+ trace_edits, trace_submissions, trace_queries = get_edits_from_trace(trace, test, uid)
- # Counts of traces where each line appears as a leaf / any node.
- n_leaf = collections.Counter()
- n_all = collections.Counter()
+ # Update edits.
+    for edit, uids in trace_edits.items():
+        edits[edit].extend(uids)
- for trace in traces:
- nodes, trace_submissions, trace_queries = trace_graph(trace)
- counted_tokens = []
- # Update the submissions/queries counters (use normalized variables).
- for (submission, correct) in trace_submissions:
- if correct:
- tokens = list(tokenize(submission))
- for token in tokens:
- if token.type == 'NAME' and token.val not in counted_tokens:
- names[token.val] += 1
- counted_tokens.append(token.val)
- code = stringify(rename_vars(tokens))
- submissions[code] += 1
+ # Update submission/query counters (use normalized variables).
+ for code, correct in trace_submissions:
+ code = stringify(rename_vars(tokenize(code)))
+ submissions[(code, correct)] += 1
for query in trace_queries:
code = stringify(rename_vars(tokenize(query)))
queries[code] += 1
- # Update the edit and leaf/node counters.
- edits.update(graph_edits(nodes))
- n_leaf.update(set([normalized(n.data[2]) for n in nodes if n.data[2] and not n.eout]))
- n_all.update(set([normalized(n.data[2]) for n in nodes if n.data[2]]))
-
# Discard edits that only occur in one trace.
- singletons = [edit for edit in edits if edits[edit] < 2]
+ singletons = [edit for edit in edits if len(edits[edit]) < 2]
for edit in singletons:
del edits[edit]
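+    # Count the observations of each edit and of each edit start (path, a).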
+ n_start = collections.Counter()
+ n_start_all = 0
+    for (path, a, b), uids in edits.items():
+        edits[(path, a, b)] = len(uids)
+        n_start[(path, a)] += len(uids)
+        n_start_all += len(uids)
+
# Find the probability of each edit a → b.
- for (a, b), count in edits.items():
- p = 1.0
- if a:
- p *= 1 - (n_leaf[a] / (n_all[a]+1))
- if b:
- b_normal = normalized(b)
- p *= n_leaf[b_normal] / (n_all[b_normal]+1)
- if a and b:
- p = math.sqrt(p)
- edits[(a, b)] = p
+ new_edits = {}
+ for (path, a, b), count in edits.items():
+ if a != b:
+ p = count / n_start[(path, a)]
+ new_edits[(path, a, b)] = p
+ edits = new_edits
# Tweak the edit distribution to improve search.
avg_p = avg(edits.values())
for edit, p in edits.items():
edits[edit] = logistic(p, k=3, x_0=avg_p)
- return edits, submissions, queries, names
+ return edits, submissions, queries
def classify_edits(edits):
inserts = {}
@@ -255,7 +194,9 @@ def classify_edits(edits):
# Extract edits and other data from existing traces for each problem.
if __name__ == '__main__':
import pickle
- from db.models import Problem, Solution
+ from db.models import CodeqUser, Problem, Solution
+ import server.problems
+ from termcolor import colored
# Ignore traces from these users.
ignored_users = [
@@ -265,21 +206,32 @@ if __name__ == '__main__':
358, # sasha
]
- edits, submissions, queries, names = {}, {}, {}, {}
+ edits, submissions, queries = {}, {}, {}
for problem in Problem.list():
pid = problem.id
- solutions = Solution.filter(problem_id=pid, done=True)
- traces = [s.trace for s in solutions if s.codeq_user_id not in ignored_users]
- if not traces:
+ solutions = [s for s in Solution.filter(problem_id=pid, done=True)
+ if s.codeq_user_id not in ignored_users]
+ if not solutions:
print('No traces for {}'.format(problem.identifier))
continue
+ # Testing function.
+ solved_problems = [p for p in CodeqUser.solved_problems(1, problem.language)
+ if p != (problem.group, problem.identifier)]
+ other_solutions = server.problems.solutions_for_problems(problem.language, solved_problems)
+ problem_module = server.problems.load_problem(problem.language, problem.group, problem.identifier, 'common')
+ def test(code):
+ correct, hints = problem_module.test(code, solved_problems)
+ return correct
+
print('Analyzing traces for {}… '.format(problem.identifier), end='', flush=True)
+ print('{} traces… '.format(len(solutions)), end='', flush=True)
try:
- edits[pid], submissions[pid], queries[pid], names[pid] = get_edits_from_traces(traces)
- print('{} edits, {} submissions, {} queries, {} names'.format(
- len(edits[pid]), len(submissions[pid]), len(queries[pid]), len(names[pid])))
+ edits[pid], submissions[pid], queries[pid] = get_edits_from_solutions(solutions, test)
+ print('{} edits, {} submissions, {} queries'.format(
+ len(edits[pid]), len(submissions[pid]), len(queries[pid])))
except Exception as ex:
- print('error: {}'.format(ex))
+ import traceback
+ traceback.print_exc()
- pickle.dump((edits, submissions, queries, names), open('edits.pickle', 'wb'))
+ pickle.dump((edits, submissions, queries), open('edits.pickle', 'wb'))