#!/usr/bin/python3 import collections from .action import expand, parse from .graph import Node from .prolog.util import rename_vars, stringify, tokenize from .util import get_line # A line edit is a contiguous sequences of actions within a single line. This # function takes a sequence of actions and builds a directed acyclic graph # where each edit represents one line edit and each node represents a version # of some line. The function returns a list of nodes (first element is the # root), and sets of submissions (program versions tested by the user) and # queries in this attempt. def edit_graph(actions, debug=False): # Return values. nodes = [Node([0, 0, ()])] # Node data: rank (Y), line no. (X), and tokens. submissions = set() # Program versions at 'test' actions. queries = set() # Queries run by the student. # State variables. leaves = {0: nodes[0]} # Current leaf node for each line. rank = 1 # Rank (order / y-position) for the next node. code_next = '' # Program code after applying the current action. # Ensure there is a separate action for each inserted/removed character. expand(actions) for action_id, action in enumerate(actions): code = code_next code_next = action.apply(code) if action.type == 'test': submissions.add(code) elif action.type == 'solve' or action.type == 'solve_all': queries.add(action.query) elif action.type == 'insert' or action.type == 'remove': # Number of the changed line. line = code[:action.offset].count('\n') # Current leaf node for this line. parent = leaves[line] # Tokens in this line after applying [action]. tokens_next = tuple(tokenize(get_line(code_next, line))) # If a new node is inserted, clone each leaf into the next rank. # This makes it easier to print the graph for graphviz; when # analyzing the graph, duplicate nodes without siblings should be # ignored. new_leaves = {} if action.text == '\n': if action.type == 'insert': tokens_next_right = tuple(tokenize(get_line(code_next, line+1))) child_left = Node([rank, line, tokens_next]) parent.add_out(child_left) child_right = Node([rank, line+1, tokens_next_right]) parent.add_out(child_right) # Create new leaf nodes. for i, leaf in leaves.items(): if i < line: new_leaves[i] = Node([rank, i, leaf.data[2]]) leaf.add_out(new_leaves[i]) elif i > line: new_leaves[i+1] = Node([rank, i+1, leaf.data[2]]) leaf.add_out(new_leaves[i+1]) new_leaves[line] = child_left new_leaves[line+1] = child_right elif action.type == 'remove': parent_right = leaves[line+1] child = Node([rank, line, tokens_next]) parent_right.add_out(child) parent.add_out(child) # Create new leaf nodes. for i, leaf in leaves.items(): if i < line: new_leaves[i] = Node([rank, i, leaf.data[2]]) leaf.add_out(new_leaves[i]) elif i > line+1: new_leaves[i-1] = Node([rank, i-1, leaf.data[2]]) leaf.add_out(new_leaves[i-1]) new_leaves[line] = child else: # Skip the node if the next action is insert/remove (except \n) # on the same line. if action_id < len(actions)-1: action_next = actions[action_id+1] if action_next.type in ('insert', 'remove'): line_next = code_next[:action_next.offset].count('\n') if action_next.text != '\n' and line == line_next: continue # Skip the node if it is the same as the parent. if tokens_next == parent.data[2]: continue child = Node([rank, line, tokens_next]) parent.add_out(child) # Create new leaf nodes. for i, leaf in leaves.items(): if i != line: new_leaves[i] = Node([rank, i, leaf.data[2]]) leaf.add_out(new_leaves[i]) new_leaves[line] = child leaves = new_leaves nodes += leaves.values() rank += 1 return nodes, submissions, queries # Return all interesting edit paths in the edit graph rooted at [root]. def get_paths(root, path=tuple(), done=None): if done is None: done = set() cur_path = list(path) if len(path) == 0 or path[-1] != root.data[2]: cur_path.append(root.data[2]) # leaf node if len(root.eout) == 0: yield tuple(cur_path) # empty node elif len(path) > 1 and len(root.data[2]) == 0: yield tuple(cur_path) if len(root.data[2]) > 0: new_path = cur_path else: new_path = [root.data[2]] done.add(root) for node in root.eout: if node not in done: yield from get_paths(node, tuple(new_path), done) # Build an edit graph for each trace and find "meaningful" (to be defined) # edits. Return a dictionary of edits and their frequencies, and also # submissions and queries in [traces]. def get_edits_from_traces(traces): # Helper function to remove trailing punctuation from lines. This is a # rather ugly performance-boosting hack. def remove_punct(line): if line and line[-1].type in ('COMMA', 'PERIOD', 'SEMI', 'FROM'): return line[:-1] return line # Return values: counts for observed edits, lines, submissions and queries. edits = collections.Counter() lines = collections.Counter() submissions = collections.Counter() queries = collections.Counter() for trace in traces: try: actions = parse(trace) except: continue nodes, trace_submissions, trace_queries = edit_graph(actions) # Update the submissions/queries counters; rename variables first to # remove trivial differences. for submission in trace_submissions: tokens = tokenize(submission) rename_vars(tokens) code = stringify(tokens) submissions[code] += 1 for query in trace_queries: tokens = tokenize(query) rename_vars(tokens) code = stringify(tokens) queries[code] += 1 # Get edits. for path in get_paths(nodes[0]): for i in range(len(path)): start = list(remove_punct(path[i])) var_names = rename_vars(start) start_t = tuple(start) for j in range(len(path[i+1:])): end = list(remove_punct(path[i+1+j])) rename_vars(end, var_names) end_t = tuple(end) if start_t != end_t: edit = (start_t, end_t) edits[edit] += 1 lines[start_t] += 1 # Discard rarely occurring edits. XXX only for testing singletons = [edit for edit in edits if edits[edit] < 2] for edit in singletons: lines[edit[0]] -= edits[edit] del edits[edit] # Get the probability of each edit given its 'before' line. for before, after in edits: edits[(before, after)] /= lines[before] # Normalize line frequencies. if len(lines) > 0: lines_max = max(lines.values()) lines = {line: count/lines_max for line, count in lines.items()} return edits, lines, submissions, queries def classify_edits(edits): inserts = {} removes = {} changes = {} for (before, after), cost in edits.items(): if after and not before: inserts[after] = cost elif before and not after: removes[before] = cost else: changes[(before, after)] = cost return inserts, removes, changes # Simplify an edit graph with given nodes: remove empty leaf nodes and other # fluff. The function is called recursively until no more changes are done. def clean_graph(nodes): changed = False # A # | --> A (when B is an empty leaf) # B for node in nodes: if len(node.eout) == 0 and len(node.ein) == 1 and len(node.data[2]) == 0: parent = node.ein[0] parent.eout.remove(node) nodes.remove(node) changed = True break # A # | --> A # A for node in nodes: if len(node.ein) == 1: parent = node.ein[0] if len(parent.eout) == 1 and node.data[2] == parent.data[2]: parent.eout = node.eout for child in parent.eout: child.ein = [parent if node == node else node for node in child.ein] nodes.remove(node) changed = True break # A A # |\ | # | C --> | (when C is empty) # |/ | # B B for node in nodes: if len(node.data[2]) == 0 and len(node.ein) == 1 and len(node.eout) == 1: parent = node.ein[0] child = node.eout[0] if len(parent.eout) == 2 and len(child.ein) == 2: parent.eout = [n for n in parent.eout if n != node] child.ein = [n for n in child.ein if n != node] nodes.remove(node) changed = True break # A # | # C --> A # | # A for node in nodes: if len(node.data[2]) == 0 and len(node.ein) == 1 and len(node.eout) == 1: parent = node.ein[0] child = node.eout[0] if len(parent.eout) == 1 and len(child.ein) == 1 and parent.data[2] == child.data[2]: parent.eout = [child] child.ein = [parent] nodes.remove(node) changed = True break if changed: # go again until nothing changes clean_graph(nodes) else: # compact node ranks ranks = set([node.data[0] for node in nodes]) missing = set(range(1,max(ranks)+1)) - ranks for node in nodes: diff = 0 for rank in sorted(missing): if rank >= node.data[0]: break diff += 1 node.data[0] -= diff if __name__ == '__main__': import os import pickle import django # Load django models. os.environ['DJANGO_SETTINGS_MODULE'] = 'webmonkey.settings' django.setup() from django.contrib.auth.models import User from tutor.models import Attempt, Problem edits = {} lines = {} submissions = {} queries = {} for problem in Problem.objects.all(): pid = problem.pk traces = [a.trace for a in Attempt.objects.filter(problem=problem, done=True)] edits[pid], lines[pid], submissions[pid], queries[pid] = get_edits_from_traces(traces) pickle.dump((edits, lines, submissions, queries), open('edits.pickle', 'wb'))