#!/usr/bin/python3

import collections
import math

from .action import expand, parse
from .graph import Node
from prolog.util import annotate, normalized, rename_vars, stringify, tokenize
from .util import get_line, avg, logistic


def trace_graph(trace):
    """Parse the sequence of actions in [trace] and return a directed acyclic
    graph representing development history.

    Each node represents a particular version of some line, and each edge
    represents a "line edit" (contiguous sequence of inserts/removes within a
    single line).

    Return a list of nodes with root as the first element. Also return sets of
    submissions (user-tested program versions) and queries in this attempt.
    """
    # Return values.
    nodes = [Node([0, 0, ()])]   # Node data: rank (Y), line no. (X), and tokens.
    submissions = set()          # Program versions at 'test' actions.
    queries = set()              # Queries run by the student.

    # State variables.
    leaves = {0: nodes[0]}       # Current leaf node for each line.
    rank = 1                     # Rank (order / y-position) for the next node.
    code_next = ''               # Program code after applying the current action.
    done = False                 # Set to True on first correct version.

    # Parse trace actions and ensure there is a separate action for each
    # inserted/removed character.
    try:
        actions = parse(trace)
        expand(actions)
    except Exception:
        # Only a few traces fail to parse, so just skip them. Catch Exception
        # (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
        actions = []

    for action_id, action in enumerate(actions):
        code = code_next
        code_next = action.apply(code)

        if action.type == 'test':
            submissions.add((code, action.total == action.passed))
            if action.total == action.passed:
                done = True
        elif action.type in ('solve', 'solve_all'):
            queries.add(action.query)
        elif action.type in ('insert', 'remove'):
            # Ignore edits after the first correct version.
            if done:
                continue

            # Number of the changed line.
            line = code[:action.offset].count('\n')
            # Current leaf node for this line.
            parent = leaves[line]
            # Tokens in this line after applying [action].
            tokens_next = tuple(tokenize(get_line(code_next, line)))

            # If a new node is inserted, clone each leaf into the next rank.
            # This makes it easier to print the graph for graphviz; when
            # analyzing the graph, duplicate nodes without siblings should be
            # ignored.
            new_leaves = {}
            if action.text == '\n':
                if action.type == 'insert':
                    # A line was split: the parent line gets two children.
                    tokens_next_right = tuple(tokenize(get_line(code_next, line+1)))

                    child_left = Node([rank, line, tokens_next])
                    parent.add_out(child_left)

                    child_right = Node([rank, line+1, tokens_next_right])
                    parent.add_out(child_right)

                    # Create new leaf nodes; lines below the split shift down.
                    for i, leaf in leaves.items():
                        if i < line:
                            new_leaves[i] = Node([rank, i, leaf.data[2]])
                            leaf.add_out(new_leaves[i])
                        elif i > line:
                            new_leaves[i+1] = Node([rank, i+1, leaf.data[2]])
                            leaf.add_out(new_leaves[i+1])
                    new_leaves[line] = child_left
                    new_leaves[line+1] = child_right
                elif action.type == 'remove':
                    # Two lines were joined: both parents share one child.
                    parent_right = leaves[line+1]

                    child = Node([rank, line, tokens_next])
                    parent_right.add_out(child)
                    parent.add_out(child)

                    # Create new leaf nodes; lines below the join shift up.
                    for i, leaf in leaves.items():
                        if i < line:
                            new_leaves[i] = Node([rank, i, leaf.data[2]])
                            leaf.add_out(new_leaves[i])
                        elif i > line+1:
                            new_leaves[i-1] = Node([rank, i-1, leaf.data[2]])
                            leaf.add_out(new_leaves[i-1])
                    new_leaves[line] = child
            else:
                # Skip the node if the next action is insert/remove (except \n)
                # on the same line.
                if action_id < len(actions)-1:
                    action_next = actions[action_id+1]
                    if action_next.type in ('insert', 'remove'):
                        line_next = code_next[:action_next.offset].count('\n')
                        if action_next.text != '\n' and line == line_next:
                            continue
                # Skip the node if it is the same as the parent.
                if tokens_next == parent.data[2]:
                    continue

                child = Node([rank, line, tokens_next])
                parent.add_out(child)

                # Create new leaf nodes.
                for i, leaf in leaves.items():
                    if i != line:
                        new_leaves[i] = Node([rank, i, leaf.data[2]])
                        leaf.add_out(new_leaves[i])
                new_leaves[line] = child

            leaves = new_leaves
            nodes += leaves.values()
            rank += 1

    return nodes, submissions, queries


def graph_edits(nodes):
    """Return a set of edits (start, end) token-tuple pairs that appear in the
    trace_graph given by [nodes]."""
    edits = set()
    for a in nodes:
        a_data = a.data[2]
        for b in a.eout:
            b_data = b.data[2]

            # Normalize a → b into start → end. Reuse variable names from a
            # when normalizing b.
            var_names = {}
            start = normalized(a_data, var_names)
            end = normalized(b_data, var_names)
            if start == end:
                continue

            # An edit start → ε happens each time the user inserts \n; ignore
            # such cases.
            if not end and len(a.eout) > 1:
                continue

            # Skip edits where start/end is composed of more than one part.
            # TODO improve trace_graph to handle this instead.
            if [t for t in annotate(stringify(start)) if t.part > 0]:
                continue
            if [t for t in annotate(stringify(end)) if t.part > 0]:
                continue

            edits.add((start, end))
    return edits


def get_edits_from_traces(traces):
    """Build an edit graph for each trace and find "meaningful" (to be
    defined) edits.

    Return a dictionary of edits and their frequencies, and also submissions,
    queries and variable names observed in [traces].
    """
    # Return values: counts for observed edits, submissions, queries and names.
    edits = collections.Counter()
    submissions = collections.Counter()
    queries = collections.Counter()
    names = collections.Counter()

    # Counts of traces where each line appears as a leaf / any node.
    n_leaf = collections.Counter()
    n_all = collections.Counter()

    for trace in traces:
        nodes, trace_submissions, trace_queries = trace_graph(trace)
        # Names already counted for this trace; a set gives O(1) membership
        # tests (only membership is ever observed, so order does not matter).
        counted_tokens = set()

        # Update the submissions/queries counters (use normalized variables).
        for (submission, correct) in trace_submissions:
            if correct:
                tokens = list(tokenize(submission))
                for token in tokens:
                    if token.type == 'NAME' and token.val not in counted_tokens:
                        names[token.val] += 1
                        counted_tokens.add(token.val)
                code = stringify(rename_vars(tokens))
                submissions[code] += 1
        for query in trace_queries:
            code = stringify(rename_vars(tokenize(query)))
            queries[code] += 1

        # Update the edit and leaf/node counters (at most once per trace).
        edits.update(graph_edits(nodes))
        n_leaf.update({normalized(n.data[2]) for n in nodes if n.data[2] and not n.eout})
        n_all.update({normalized(n.data[2]) for n in nodes if n.data[2]})

    # Discard edits that only occur in one trace.
    singletons = [edit for edit in edits if edits[edit] < 2]
    for edit in singletons:
        del edits[edit]

    # Find the probability of each edit a → b. (Rebinding values while
    # iterating items() is safe: no keys are added or removed.)
    for (a, b), count in edits.items():
        p = 1.0
        if a:
            p *= 1 - (n_leaf[a] / (n_all[a]+1))
        if b:
            b_normal = normalized(b)
            p *= n_leaf[b_normal] / (n_all[b_normal]+1)
        if a and b:
            p = math.sqrt(p)
        edits[(a, b)] = p

    # Tweak the edit distribution to improve search. Guard against an empty
    # counter, where avg() would have no values to average.
    if edits:
        avg_p = avg(edits.values())
        for edit, p in edits.items():
            edits[edit] = logistic(p, k=3, x_0=avg_p)

    return edits, submissions, queries, names


def classify_edits(edits):
    """Split [edits] (a {(before, after): cost} mapping) into three dicts:
    inserts (empty before), removes (empty after) and changes (both)."""
    inserts = {}
    removes = {}
    changes = {}
    for (before, after), cost in edits.items():
        if after and not before:
            inserts[after] = cost
        elif before and not after:
            removes[before] = cost
        else:
            changes[(before, after)] = cost
    return inserts, removes, changes


# Extract edits and other data from existing traces for each problem.
if __name__ == '__main__':
    import pickle
    from db.models import Problem, Solution

    # Ignore traces from these users.
    ignored_users = [
        1,    # admin
        231,  # test
        360,  # test2
        358,  # sasha
    ]

    edits, submissions, queries, names = {}, {}, {}, {}
    for problem in Problem.list():
        pid = problem.id
        solutions = Solution.filter(problem_id=pid, done=True)
        traces = [s.trace for s in solutions if s.codeq_user_id not in ignored_users]
        if not traces:
            print('No traces for {}'.format(problem.name))
            continue

        print('Analyzing traces for {}… '.format(problem.name), end='', flush=True)
        try:
            edits[pid], submissions[pid], queries[pid], names[pid] = get_edits_from_traces(traces)
            print('{} edits, {} submissions, {} queries, {} names'.format(
                len(edits[pid]), len(submissions[pid]), len(queries[pid]), len(names[pid])))
        except Exception as ex:
            print('error: {}'.format(ex))

    # Use a context manager so the pickle file is flushed and closed even if
    # dump() raises (the original leaked the file handle).
    with open('edits.pickle', 'wb') as f:
        pickle.dump((edits, submissions, queries, names), f)