diff options
author | Timotej Lazar <timotej.lazar@araneo.org> | 2015-01-15 12:10:22 +0100 |
---|---|---|
committer | Aleš Smodiš <aless@guru.si> | 2015-08-11 14:26:01 +0200 |
commit | 819ab10281c9bd6c000364c3a243959edd18abf7 (patch) | |
tree | 5ca3452418b49781563221bb56cf70e1d0fb1bb8 /monkey/edits.py | |
parent | d86793039957aa408a98806aecfb5964bda5fb87 (diff) |
Move pymonkey stuff to monkey/
Importing pymonkey into webmonkey, let's see how this works.
Diffstat (limited to 'monkey/edits.py')
-rw-r--r-- | monkey/edits.py | 307 |
1 files changed, 307 insertions, 0 deletions
diff --git a/monkey/edits.py b/monkey/edits.py new file mode 100644 index 0000000..fef591a --- /dev/null +++ b/monkey/edits.py @@ -0,0 +1,307 @@ +#!/usr/bin/python3 + +import collections + +from action import expand, parse +from graph import Node +from prolog.util import rename_vars, stringify, tokenize +from util import get_line + +# A line edit is a contiguous sequences of actions within a single line. This +# function takes a sequence of actions and builds a directed acyclic graph +# where each edit represents one line edit and each node represents a version +# of some line. The function returns a list of nodes (first element is the +# root), and sets of submissions (program versions tested by the user) and +# queries in this attempt. +def edit_graph(actions, debug=False): + # Return values. + nodes = [Node([0, 0, ()])] # Node data: rank (Y), line no. (X), and tokens. + submissions = set() # Program versions at 'test' actions. + queries = set() # Queries run by the student. + + # State variables. + leaves = {0: nodes[0]} # Current leaf node for each line. + rank = 1 # Rank (order / y-position) for the next node. + code_next = '' # Program code after applying the current action. + + # Ensure there is a separate action for each inserted/removed character. + expand(actions) + for action_id, action in enumerate(actions): + code = code_next + code_next = action.apply(code) + + if action.type == 'test': + submissions.add(code) + + elif action.type == 'solve' or action.type == 'solve_all': + queries.add(action.query) + + elif action.type == 'insert' or action.type == 'remove': + # Number of the changed line. + line = code[:action.offset].count('\n') + # Current leaf node for this line. + parent = leaves[line] + # Tokens in this line after applying [action]. + tokens_next = tuple(tokenize(get_line(code_next, line))) + + # If a new node is inserted, clone each leaf into the next rank. + # This makes it easier to print the graph for graphviz; when + # analyzing the graph, duplicate nodes without siblings should be + # ignored. + new_leaves = {} + + if action.text == '\n': + if action.type == 'insert': + tokens_next_right = tuple(tokenize(get_line(code_next, line+1))) + + child_left = Node([rank, line, tokens_next]) + parent.add_out(child_left) + + child_right = Node([rank, line+1, tokens_next_right]) + parent.add_out(child_right) + + # Create new leaf nodes. + for i, leaf in leaves.items(): + if i < line: + new_leaves[i] = Node([rank, i, leaf.data[2]]) + leaf.add_out(new_leaves[i]) + elif i > line: + new_leaves[i+1] = Node([rank, i+1, leaf.data[2]]) + leaf.add_out(new_leaves[i+1]) + new_leaves[line] = child_left + new_leaves[line+1] = child_right + + elif action.type == 'remove': + parent_right = leaves[line+1] + + child = Node([rank, line, tokens_next]) + parent_right.add_out(child) + parent.add_out(child) + + # Create new leaf nodes. + for i, leaf in leaves.items(): + if i < line: + new_leaves[i] = Node([rank, i, leaf.data[2]]) + leaf.add_out(new_leaves[i]) + elif i > line+1: + new_leaves[i-1] = Node([rank, i-1, leaf.data[2]]) + leaf.add_out(new_leaves[i-1]) + new_leaves[line] = child + else: + # Skip the node if the next action is insert/remove (except \n) + # on the same line. + if action_id < len(actions)-1: + action_next = actions[action_id+1] + if action_next.type in ('insert', 'remove'): + line_next = code_next[:action_next.offset].count('\n') + if action_next.text != '\n' and line == line_next: + continue + + # Skip the node if it is the same as the parent. + if tokens_next == parent.data[2]: + continue + + child = Node([rank, line, tokens_next]) + parent.add_out(child) + + # Create new leaf nodes. + for i, leaf in leaves.items(): + if i != line: + new_leaves[i] = Node([rank, i, leaf.data[2]]) + leaf.add_out(new_leaves[i]) + new_leaves[line] = child + + leaves = new_leaves + nodes += leaves.values() + rank += 1 + + return nodes, submissions, queries + +# Return all interesting edit paths in the edit graph rooted at [root]. +def get_paths(root, path=tuple(), done=None): + if done is None: + done = set() + + cur_path = list(path) + if len(path) == 0 or path[-1] != root.data[2]: + cur_path.append(root.data[2]) + + # leaf node + if len(root.eout) == 0: + yield tuple(cur_path) + # empty node + elif len(path) > 1 and len(root.data[2]) == 0: + yield tuple(cur_path) + + if len(root.data[2]) > 0: + new_path = cur_path + else: + new_path = [root.data[2]] + done.add(root) + + for node in root.eout: + if node not in done: + yield from get_paths(node, tuple(new_path), done) + +# Build an edit graph for each trace and find "meaningful" (to be defined) +# edits. Return a dictionary of edits and their frequencies, and also +# submissions and queries in [traces]. +def get_edits_from_traces(traces): + # Helper function to remove trailing punctuation from lines. This is a + # rather ugly performance-boosting hack. + def remove_punct(line): + if line and line[-1].type in ('COMMA', 'PERIOD', 'SEMI', 'FROM'): + return line[:-1] + return line + + # Return values: counts for observed edits, lines, submissions and queries. + edits = collections.Counter() + lines = collections.Counter() + submissions = collections.Counter() + queries = collections.Counter() + + for trace in traces: + try: + actions = parse(trace) + except: + continue + nodes, trace_submissions, trace_queries = edit_graph(actions) + + # Update the submissions/queries counters; rename variables first to + # remove trivial differences. + for submission in trace_submissions: + tokens = tokenize(submission) + rename_vars(tokens) + code = stringify(tokens) + submissions[code] += 1 + + for query in trace_queries: + tokens = tokenize(query) + rename_vars(tokens) + code = stringify(tokens) + queries[code] += 1 + + # Get edits. + for path in get_paths(nodes[0]): + for i in range(len(path)): + start = list(remove_punct(path[i])) + var_names = rename_vars(start) + start_t = tuple(start) + + for j in range(len(path[i+1:])): + end = list(remove_punct(path[i+1+j])) + rename_vars(end, var_names) + end_t = tuple(end) + + if start_t != end_t: + edit = (start_t, end_t) + edits[edit] += 1 + lines[start_t] += 1 + + # Discard rarely occurring edits. XXX only for testing + singletons = [edit for edit in edits if edits[edit] < 2] + for edit in singletons: + lines[edit[0]] -= edits[edit] + del edits[edit] + + # Get the probability of each edit given its 'before' line. + for before, after in edits: + edits[(before, after)] /= lines[before] + + # Normalize line frequencies. + if len(lines) > 0: + lines_max = max(lines.values()) + lines = {line: count/lines_max for line, count in lines.items()} + + return edits, lines, submissions, queries + +def classify_edits(edits): + inserts = {} + removes = {} + changes = {} + for (before, after), cost in edits.items(): + if after and not before: + inserts[after] = cost + elif before and not after: + removes[before] = cost + else: + changes[(before, after)] = cost + return inserts, removes, changes + +# Simplify an edit graph with given nodes: remove empty leaf nodes and other +# fluff. The function is called recursively until no more changes are done. +def clean_graph(nodes): + changed = False + + # A + # | --> A (when B is an empty leaf) + # B + for node in nodes: + if len(node.eout) == 0 and len(node.ein) == 1 and len(node.data[2]) == 0: + parent = node.ein[0] + parent.eout.remove(node) + nodes.remove(node) + changed = True + break + + # A + # | --> A + # A + for node in nodes: + if len(node.ein) == 1: + parent = node.ein[0] + if len(parent.eout) == 1 and node.data[2] == parent.data[2]: + parent.eout = node.eout + for child in parent.eout: + child.ein = [parent if node == node else node for node in child.ein] + nodes.remove(node) + changed = True + break + + # A A + # |\ | + # | C --> | (when C is empty) + # |/ | + # B B + for node in nodes: + if len(node.data[2]) == 0 and len(node.ein) == 1 and len(node.eout) == 1: + parent = node.ein[0] + child = node.eout[0] + if len(parent.eout) == 2 and len(child.ein) == 2: + parent.eout = [n for n in parent.eout if n != node] + child.ein = [n for n in child.ein if n != node] + nodes.remove(node) + changed = True + break + + # A + # | + # C --> A + # | + # A + for node in nodes: + if len(node.data[2]) == 0 and len(node.ein) == 1 and len(node.eout) == 1: + parent = node.ein[0] + child = node.eout[0] + if len(parent.eout) == 1 and len(child.ein) == 1 and parent.data[2] == child.data[2]: + parent.eout = [child] + child.ein = [parent] + nodes.remove(node) + changed = True + break + + if changed: + # go again until nothing changes + clean_graph(nodes) + else: + # compact node ranks + ranks = set([node.data[0] for node in nodes]) + missing = set(range(1,max(ranks)+1)) - ranks + + for node in nodes: + diff = 0 + for rank in sorted(missing): + if rank >= node.data[0]: + break + diff += 1 + node.data[0] -= diff |