#!/usr/bin/python3

import collections
import math

from .action import expand, parse
from .graph import Node
from prolog.util import rename_vars, stringify, tokenize
from .util import get_line, avg, logistic

# A line edit is a contiguous sequence of actions within a single line. This
# function takes a sequence of actions and builds a directed acyclic graph
# where each edge represents one line edit and each node represents a version
# of some line. The function returns a list of nodes (the first element is the
# root), and sets of submissions (program versions tested by the user) and
# queries in this attempt.
def edit_graph(actions, debug=False):
    # Return values.
    nodes = [Node([0, 0, ()])]   # Node data: rank (Y), line no. (X), and tokens.
    submissions = set()          # Program versions at 'test' actions.
    queries = set()              # Queries run by the student.

    # State variables.
    leaves = {0: nodes[0]}       # Current leaf node for each line.
    rank = 1                     # Rank (order / y-position) for the next node.
    code_next = ''               # Program code after applying the current action.

    # Ensure there is a separate action for each inserted/removed character.
    expand(actions)

    for action_id, action in enumerate(actions):
        code = code_next
        code_next = action.apply(code)

        if action.type == 'test':
            submissions.add(code)

        elif action.type == 'solve' or action.type == 'solve_all':
            queries.add(action.query)

        elif action.type == 'insert' or action.type == 'remove':
            # Number of the changed line.
            line = code[:action.offset].count('\n')
            # Current leaf node for this line.
            parent = leaves[line]
            # Tokens in this line after applying [action].
            tokens_next = tuple(tokenize(get_line(code_next, line)))

            # If a new node is inserted, clone each leaf into the next rank.
            # This makes it easier to print the graph for graphviz; when
            # analyzing the graph, duplicate nodes without siblings should be
            # ignored.
            new_leaves = {}
            if action.text == '\n':
                if action.type == 'insert':
                    tokens_next_right = tuple(tokenize(get_line(code_next, line+1)))

                    child_left = Node([rank, line, tokens_next])
                    parent.add_out(child_left)

                    child_right = Node([rank, line+1, tokens_next_right])
                    parent.add_out(child_right)

                    # Create new leaf nodes.
                    for i, leaf in leaves.items():
                        if i < line:
                            new_leaves[i] = Node([rank, i, leaf.data[2]])
                            leaf.add_out(new_leaves[i])
                        elif i > line:
                            new_leaves[i+1] = Node([rank, i+1, leaf.data[2]])
                            leaf.add_out(new_leaves[i+1])
                    new_leaves[line] = child_left
                    new_leaves[line+1] = child_right

                elif action.type == 'remove':
                    parent_right = leaves[line+1]

                    child = Node([rank, line, tokens_next])
                    parent_right.add_out(child)
                    parent.add_out(child)

                    # Create new leaf nodes.
                    for i, leaf in leaves.items():
                        if i < line:
                            new_leaves[i] = Node([rank, i, leaf.data[2]])
                            leaf.add_out(new_leaves[i])
                        elif i > line+1:
                            new_leaves[i-1] = Node([rank, i-1, leaf.data[2]])
                            leaf.add_out(new_leaves[i-1])
                    new_leaves[line] = child
            else:
                # Skip the node if the next action is insert/remove (except \n)
                # on the same line.
                if action_id < len(actions)-1:
                    action_next = actions[action_id+1]
                    if action_next.type in ('insert', 'remove'):
                        line_next = code_next[:action_next.offset].count('\n')
                        if action_next.text != '\n' and line == line_next:
                            continue

                # Skip the node if it is the same as the parent.
                if tokens_next == parent.data[2]:
                    continue

                child = Node([rank, line, tokens_next])
                parent.add_out(child)

                # Create new leaf nodes.
                for i, leaf in leaves.items():
                    if i != line:
                        new_leaves[i] = Node([rank, i, leaf.data[2]])
                        leaf.add_out(new_leaves[i])
                new_leaves[line] = child

            leaves = new_leaves
            nodes += leaves.values()
            rank += 1

    return nodes, submissions, queries
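
# Illustrative sketch (not used elsewhere in this module): build the edit graph
# for a single attempt and dump its nodes. Assumes [trace] is one raw action
# log in the format accepted by action.parse(), and that stringify() accepts
# any sequence of tokens (it is only ever passed lists in the rest of the file).
def _print_edit_graph(trace):
    actions = parse(trace)
    nodes, submissions, queries = edit_graph(actions)
    for node in nodes:
        rank, line, tokens = node.data
        print('rank {}, line {}: {}'.format(rank, line, stringify(list(tokens))))
    print('{} submissions, {} queries'.format(len(submissions), len(queries)))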

# Generate all interesting paths in the edit graph rooted at [root].
def get_paths(root, path=None, done=None):
    if done is None:
        done = set()

    # Add [root] to [path] if it is the first node or different than previous.
    if not path:
        path = (root.data[2],)
    elif root.data[2] != path[-1]:
        path = path + (root.data[2],)

    # Return the current path if [root] is a leaf or an empty node.
    if len(path) > 1:
        if not root.eout or not root.data[2]:
            yield path

    # If [root] is an empty node, start a new path.
    if not root.data[2]:
        path = (root.data[2],)

    done.add(root)
    for node in root.eout:
        if node not in done:
            yield from get_paths(node, path, done)

# Build an edit graph for each trace and find "meaningful" (to be defined)
# edits. Return a dictionary of edits and their frequencies, and also
# submissions and queries in [traces].
def get_edits_from_traces(traces):
    # Helper function to remove trailing punctuation from lines and rename
    # variables to A1,A2,A3,… (potentially using [var_names]). Return a tuple.
    def normalize(line, var_names=None):
        # Remove trailing punctuation.
        i = len(line)
        while i > 0:
            if line[i-1].type not in ('COMMA', 'PERIOD', 'SEMI'):
                break
            i -= 1
        return tuple(rename_vars(line[:i], var_names))

    # Return values: counts for observed edits, lines, submissions and queries.
    edits = collections.Counter()
    submissions = collections.Counter()
    queries = collections.Counter()

    # Counts of traces where each line appears as a leaf / any node.
    n_leaf = collections.Counter()
    n_all = collections.Counter()

    for trace in traces:
        try:
            actions = parse(trace)
        except:
            # Only a few traces fail to parse, so just ignore them.
            continue

        nodes, trace_submissions, trace_queries = edit_graph(actions)

        # Update the submissions/queries counters (use normalized variables).
        for submission in trace_submissions:
            code = stringify(rename_vars(tokenize(submission)))
            submissions[code] += 1
        for query in trace_queries:
            code = stringify(rename_vars(tokenize(query)))
            queries[code] += 1

        # Get edits.
        trace_edits = set()
        for path in get_paths(nodes[0]):
            for i in range(1, len(path)):
                # Normalize path[i-1] → path[i] into start → end. Reuse
                # variable names from start when normalizing end.
                var_names = {}
                start = normalize(path[i-1], var_names)
                end = normalize(path[i], var_names)

                # Disallow edits that insert a whole rule (a → … :- …).
                # TODO improve edit_graph to handle this.
                if 'FROM' in [t.type for t in end[:-1]]:
                    continue

                # This should always succeed but check anyway.
                if start != end:
                    edit = (start, end)
                    trace_edits.add(edit)
        edits.update(trace_edits)

        # Update node counts.
        n_leaf.update(set([normalize(n.data[2]) for n in nodes if n.data[2] and not n.eout]))
        n_all.update(set([normalize(n.data[2]) for n in nodes if n.data[2]]))

    # Discard edits that only occur in one trace.
    singletons = [edit for edit in edits if edits[edit] < 2]
    for edit in singletons:
        del edits[edit]

    # Find the probability of each edit a → b.
    for (a, b), count in edits.items():
        p = 1.0
        if a:
            p *= 1 - (n_leaf[a] / (n_all[a]+1))
        if b:
            b_normal = normalize(b)
            p *= n_leaf[b_normal] / (n_all[b_normal]+1)
        if a and b:
            p = math.sqrt(p)
        edits[(a, b)] = p

    # Tweak the edit distribution to improve search.
    avg_p = avg(edits.values())
    for edit, p in edits.items():
        edits[edit] = logistic(p, k=3, x_0=avg_p)

    lines = dict(n_leaf)

    return edits, lines, submissions, queries
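
# Illustrative sketch: compute the edit distribution for a list of raw traces
# and print the [n] highest-scoring edits. Assumes [traces] is a list of action
# logs, e.g. as gathered in the __main__ block below; the output formatting is
# only a suggestion.
def _print_top_edits(traces, n=10):
    edits, lines, submissions, queries = get_edits_from_traces(traces)
    print('{} edits, {} lines, {} submissions, {} queries'.format(
        len(edits), len(lines), len(submissions), len(queries)))
    for (before, after), score in sorted(edits.items(), key=lambda item: item[1], reverse=True)[:n]:
        print('{:.3f}  {} → {}'.format(score, stringify(list(before)), stringify(list(after))))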

# Separate edits into pure insertions, pure removals and changes.
def classify_edits(edits):
    inserts = {}
    removes = {}
    changes = {}
    for (before, after), cost in edits.items():
        if after and not before:
            inserts[after] = cost
        elif before and not after:
            removes[before] = cost
        else:
            changes[(before, after)] = cost
    return inserts, removes, changes

# Simplify an edit graph with given nodes: remove empty leaf nodes and other
# fluff. The function is called recursively until no more changes are done.
def clean_graph(nodes):
    changed = False

    # A
    # | --> A (when B is an empty leaf)
    # B
    for node in nodes:
        if len(node.eout) == 0 and len(node.ein) == 1 and len(node.data[2]) == 0:
            parent = node.ein[0]
            parent.eout.remove(node)
            nodes.remove(node)
            changed = True
            break

    # A
    # | --> A
    # A
    for node in nodes:
        if len(node.ein) == 1:
            parent = node.ein[0]
            if len(parent.eout) == 1 and node.data[2] == parent.data[2]:
                parent.eout = node.eout
                for child in parent.eout:
                    # Replace the removed node with its parent in each child's in-edges.
                    child.ein = [parent if n == node else n for n in child.ein]
                nodes.remove(node)
                changed = True
                break

    # A   A
    # |\  |
    # | C --> | (when C is empty)
    # |/  |
    # B   B
    for node in nodes:
        if len(node.data[2]) == 0 and len(node.ein) == 1 and len(node.eout) == 1:
            parent = node.ein[0]
            child = node.eout[0]
            if len(parent.eout) == 2 and len(child.ein) == 2:
                parent.eout = [n for n in parent.eout if n != node]
                child.ein = [n for n in child.ein if n != node]
                nodes.remove(node)
                changed = True
                break

    # A
    # |
    # C --> A (when C is empty)
    # |
    # A
    for node in nodes:
        if len(node.data[2]) == 0 and len(node.ein) == 1 and len(node.eout) == 1:
            parent = node.ein[0]
            child = node.eout[0]
            if len(parent.eout) == 1 and len(child.ein) == 1 and parent.data[2] == child.data[2]:
                parent.eout = [child]
                child.ein = [parent]
                nodes.remove(node)
                changed = True
                break

    if changed:
        # Go again until nothing changes.
        clean_graph(nodes)
    else:
        # Compact node ranks.
        ranks = set([node.data[0] for node in nodes])
        missing = set(range(1, max(ranks)+1)) - ranks
        for node in nodes:
            diff = 0
            for rank in sorted(missing):
                if rank >= node.data[0]:
                    break
                diff += 1
            node.data[0] -= diff

if __name__ == '__main__':
    import os
    import pickle

    import django

    # Load Django models.
    os.environ['DJANGO_SETTINGS_MODULE'] = 'webmonkey.settings'
    django.setup()
    from django.contrib.auth.models import User
    from tutor.models import Attempt, Problem

    edits = {}
    lines = {}
    submissions = {}
    queries = {}
    for problem in Problem.objects.all():
        print(problem.name)
        pid = problem.pk
        traces = [a.trace for a in Attempt.objects.filter(problem=problem, done=True)]
        edits[pid], lines[pid], submissions[pid], queries[pid] = get_edits_from_traces(traces)

    with open('edits.pickle', 'wb') as f:
        pickle.dump((edits, lines, submissions, queries), f)
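
# The pickled results can later be inspected in a separate session, for example
# (illustrative sketch; [pid] stands for one problem's primary key):
#
#   import pickle
#   with open('edits.pickle', 'rb') as f:
#       edits, lines, submissions, queries = pickle.load(f)
#   inserts, removes, changes = classify_edits(edits[pid])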