summaryrefslogtreecommitdiff
path: root/monkey/edits.py
diff options
context:
space:
mode:
authorTimotej Lazar <timotej.lazar@araneo.org>2015-01-15 12:10:22 +0100
committerAleš Smodiš <aless@guru.si>2015-08-11 14:26:01 +0200
commit819ab10281c9bd6c000364c3a243959edd18abf7 (patch)
tree5ca3452418b49781563221bb56cf70e1d0fb1bb8 /monkey/edits.py
parentd86793039957aa408a98806aecfb5964bda5fb87 (diff)
Move pymonkey stuff to monkey/
Importing pymonkey into webmonkey, let's see how this works.
Diffstat (limited to 'monkey/edits.py')
-rw-r--r--monkey/edits.py307
1 files changed, 307 insertions, 0 deletions
diff --git a/monkey/edits.py b/monkey/edits.py
new file mode 100644
index 0000000..fef591a
--- /dev/null
+++ b/monkey/edits.py
@@ -0,0 +1,307 @@
+#!/usr/bin/python3
+
+import collections
+
+from action import expand, parse
+from graph import Node
+from prolog.util import rename_vars, stringify, tokenize
+from util import get_line
+
+# A line edit is a contiguous sequences of actions within a single line. This
+# function takes a sequence of actions and builds a directed acyclic graph
+# where each edit represents one line edit and each node represents a version
+# of some line. The function returns a list of nodes (first element is the
+# root), and sets of submissions (program versions tested by the user) and
+# queries in this attempt.
+def edit_graph(actions, debug=False):
+ # Return values.
+ nodes = [Node([0, 0, ()])] # Node data: rank (Y), line no. (X), and tokens.
+ submissions = set() # Program versions at 'test' actions.
+ queries = set() # Queries run by the student.
+
+ # State variables.
+ leaves = {0: nodes[0]} # Current leaf node for each line.
+ rank = 1 # Rank (order / y-position) for the next node.
+ code_next = '' # Program code after applying the current action.
+
+ # Ensure there is a separate action for each inserted/removed character.
+ expand(actions)
+ for action_id, action in enumerate(actions):
+ code = code_next
+ code_next = action.apply(code)
+
+ if action.type == 'test':
+ submissions.add(code)
+
+ elif action.type == 'solve' or action.type == 'solve_all':
+ queries.add(action.query)
+
+ elif action.type == 'insert' or action.type == 'remove':
+ # Number of the changed line.
+ line = code[:action.offset].count('\n')
+ # Current leaf node for this line.
+ parent = leaves[line]
+ # Tokens in this line after applying [action].
+ tokens_next = tuple(tokenize(get_line(code_next, line)))
+
+ # If a new node is inserted, clone each leaf into the next rank.
+ # This makes it easier to print the graph for graphviz; when
+ # analyzing the graph, duplicate nodes without siblings should be
+ # ignored.
+ new_leaves = {}
+
+ if action.text == '\n':
+ if action.type == 'insert':
+ tokens_next_right = tuple(tokenize(get_line(code_next, line+1)))
+
+ child_left = Node([rank, line, tokens_next])
+ parent.add_out(child_left)
+
+ child_right = Node([rank, line+1, tokens_next_right])
+ parent.add_out(child_right)
+
+ # Create new leaf nodes.
+ for i, leaf in leaves.items():
+ if i < line:
+ new_leaves[i] = Node([rank, i, leaf.data[2]])
+ leaf.add_out(new_leaves[i])
+ elif i > line:
+ new_leaves[i+1] = Node([rank, i+1, leaf.data[2]])
+ leaf.add_out(new_leaves[i+1])
+ new_leaves[line] = child_left
+ new_leaves[line+1] = child_right
+
+ elif action.type == 'remove':
+ parent_right = leaves[line+1]
+
+ child = Node([rank, line, tokens_next])
+ parent_right.add_out(child)
+ parent.add_out(child)
+
+ # Create new leaf nodes.
+ for i, leaf in leaves.items():
+ if i < line:
+ new_leaves[i] = Node([rank, i, leaf.data[2]])
+ leaf.add_out(new_leaves[i])
+ elif i > line+1:
+ new_leaves[i-1] = Node([rank, i-1, leaf.data[2]])
+ leaf.add_out(new_leaves[i-1])
+ new_leaves[line] = child
+ else:
+ # Skip the node if the next action is insert/remove (except \n)
+ # on the same line.
+ if action_id < len(actions)-1:
+ action_next = actions[action_id+1]
+ if action_next.type in ('insert', 'remove'):
+ line_next = code_next[:action_next.offset].count('\n')
+ if action_next.text != '\n' and line == line_next:
+ continue
+
+ # Skip the node if it is the same as the parent.
+ if tokens_next == parent.data[2]:
+ continue
+
+ child = Node([rank, line, tokens_next])
+ parent.add_out(child)
+
+ # Create new leaf nodes.
+ for i, leaf in leaves.items():
+ if i != line:
+ new_leaves[i] = Node([rank, i, leaf.data[2]])
+ leaf.add_out(new_leaves[i])
+ new_leaves[line] = child
+
+ leaves = new_leaves
+ nodes += leaves.values()
+ rank += 1
+
+ return nodes, submissions, queries
+
+# Return all interesting edit paths in the edit graph rooted at [root].
+def get_paths(root, path=tuple(), done=None):
+ if done is None:
+ done = set()
+
+ cur_path = list(path)
+ if len(path) == 0 or path[-1] != root.data[2]:
+ cur_path.append(root.data[2])
+
+ # leaf node
+ if len(root.eout) == 0:
+ yield tuple(cur_path)
+ # empty node
+ elif len(path) > 1 and len(root.data[2]) == 0:
+ yield tuple(cur_path)
+
+ if len(root.data[2]) > 0:
+ new_path = cur_path
+ else:
+ new_path = [root.data[2]]
+ done.add(root)
+
+ for node in root.eout:
+ if node not in done:
+ yield from get_paths(node, tuple(new_path), done)
+
+# Build an edit graph for each trace and find "meaningful" (to be defined)
+# edits. Return a dictionary of edits and their frequencies, and also
+# submissions and queries in [traces].
+def get_edits_from_traces(traces):
+ # Helper function to remove trailing punctuation from lines. This is a
+ # rather ugly performance-boosting hack.
+ def remove_punct(line):
+ if line and line[-1].type in ('COMMA', 'PERIOD', 'SEMI', 'FROM'):
+ return line[:-1]
+ return line
+
+ # Return values: counts for observed edits, lines, submissions and queries.
+ edits = collections.Counter()
+ lines = collections.Counter()
+ submissions = collections.Counter()
+ queries = collections.Counter()
+
+ for trace in traces:
+ try:
+ actions = parse(trace)
+ except:
+ continue
+ nodes, trace_submissions, trace_queries = edit_graph(actions)
+
+ # Update the submissions/queries counters; rename variables first to
+ # remove trivial differences.
+ for submission in trace_submissions:
+ tokens = tokenize(submission)
+ rename_vars(tokens)
+ code = stringify(tokens)
+ submissions[code] += 1
+
+ for query in trace_queries:
+ tokens = tokenize(query)
+ rename_vars(tokens)
+ code = stringify(tokens)
+ queries[code] += 1
+
+ # Get edits.
+ for path in get_paths(nodes[0]):
+ for i in range(len(path)):
+ start = list(remove_punct(path[i]))
+ var_names = rename_vars(start)
+ start_t = tuple(start)
+
+ for j in range(len(path[i+1:])):
+ end = list(remove_punct(path[i+1+j]))
+ rename_vars(end, var_names)
+ end_t = tuple(end)
+
+ if start_t != end_t:
+ edit = (start_t, end_t)
+ edits[edit] += 1
+ lines[start_t] += 1
+
+ # Discard rarely occurring edits. XXX only for testing
+ singletons = [edit for edit in edits if edits[edit] < 2]
+ for edit in singletons:
+ lines[edit[0]] -= edits[edit]
+ del edits[edit]
+
+ # Get the probability of each edit given its 'before' line.
+ for before, after in edits:
+ edits[(before, after)] /= lines[before]
+
+ # Normalize line frequencies.
+ if len(lines) > 0:
+ lines_max = max(lines.values())
+ lines = {line: count/lines_max for line, count in lines.items()}
+
+ return edits, lines, submissions, queries
+
+def classify_edits(edits):
+ inserts = {}
+ removes = {}
+ changes = {}
+ for (before, after), cost in edits.items():
+ if after and not before:
+ inserts[after] = cost
+ elif before and not after:
+ removes[before] = cost
+ else:
+ changes[(before, after)] = cost
+ return inserts, removes, changes
+
+# Simplify an edit graph with given nodes: remove empty leaf nodes and other
+# fluff. The function is called recursively until no more changes are done.
+def clean_graph(nodes):
+ changed = False
+
+ # A
+ # | --> A (when B is an empty leaf)
+ # B
+ for node in nodes:
+ if len(node.eout) == 0 and len(node.ein) == 1 and len(node.data[2]) == 0:
+ parent = node.ein[0]
+ parent.eout.remove(node)
+ nodes.remove(node)
+ changed = True
+ break
+
+ # A
+ # | --> A
+ # A
+ for node in nodes:
+ if len(node.ein) == 1:
+ parent = node.ein[0]
+ if len(parent.eout) == 1 and node.data[2] == parent.data[2]:
+ parent.eout = node.eout
+ for child in parent.eout:
+ child.ein = [parent if node == node else node for node in child.ein]
+ nodes.remove(node)
+ changed = True
+ break
+
+ # A A
+ # |\ |
+ # | C --> | (when C is empty)
+ # |/ |
+ # B B
+ for node in nodes:
+ if len(node.data[2]) == 0 and len(node.ein) == 1 and len(node.eout) == 1:
+ parent = node.ein[0]
+ child = node.eout[0]
+ if len(parent.eout) == 2 and len(child.ein) == 2:
+ parent.eout = [n for n in parent.eout if n != node]
+ child.ein = [n for n in child.ein if n != node]
+ nodes.remove(node)
+ changed = True
+ break
+
+ # A
+ # |
+ # C --> A
+ # |
+ # A
+ for node in nodes:
+ if len(node.data[2]) == 0 and len(node.ein) == 1 and len(node.eout) == 1:
+ parent = node.ein[0]
+ child = node.eout[0]
+ if len(parent.eout) == 1 and len(child.ein) == 1 and parent.data[2] == child.data[2]:
+ parent.eout = [child]
+ child.ein = [parent]
+ nodes.remove(node)
+ changed = True
+ break
+
+ if changed:
+ # go again until nothing changes
+ clean_graph(nodes)
+ else:
+ # compact node ranks
+ ranks = set([node.data[0] for node in nodes])
+ missing = set(range(1,max(ranks)+1)) - ranks
+
+ for node in nodes:
+ diff = 0
+ for rank in sorted(missing):
+ if rank >= node.data[0]:
+ break
+ diff += 1
+ node.data[0] -= diff