Move pymonkey stuff to monkey/

Importing pymonkey into webmonkey, let's see how this works.
author: Timotej Lazar <timotej.lazar@araneo.org> 2015-01-15 12:10:22 +0100
committer: Aleš Smodiš <aless@guru.si> 2015-08-11 14:26:01 +0200
commit: 819ab10281c9bd6c000364c3a243959edd18abf7 (patch)
tree: 5ca3452418b49781563221bb56cf70e1d0fb1bb8 /monkey/edits.py
parent: d86793039957aa408a98806aecfb5964bda5fb87 (diff)
1 files changed, 307 insertions, 0 deletions
diff --git a/monkey/edits.py b/monkey/edits.py
new file mode 100644
index 0000000..fef591a
--- /dev/null
+++ b/monkey/edits.py
@@ -0,0 +1,307 @@
+#!/usr/bin/python3
+
+import collections
+
+from action import expand, parse
+from graph import Node
+from prolog.util import rename_vars, stringify, tokenize
+from util import get_line
+
+# A line edit is a contiguous sequences of actions within a single line. This
+# function takes a sequence of actions and builds a directed acyclic graph
+# where each edit represents one line edit and each node represents a version
+# of some line. The function returns a list of nodes (first element is the
+# root), and sets of submissions (program versions tested by the user) and
+# queries in this attempt.
+def edit_graph(actions, debug=False):
+    # Return values.
+    nodes = [Node([0, 0, ()])]  # Node data: rank (Y), line no. (X), and tokens.
+    submissions = set()         # Program versions at 'test' actions.
+    queries = set()             # Queries run by the student.
+
+    # State variables.
+    leaves = {0: nodes[0]}  # Current leaf node for each line.
+    rank = 1                # Rank (order / y-position) for the next node.
+    code_next = ''          # Program code after applying the current action.
+
+    # Ensure there is a separate action for each inserted/removed character.
+    expand(actions)
+    for action_id, action in enumerate(actions):
+        code = code_next
+        code_next = action.apply(code)
+
+        if action.type == 'test':
+            submissions.add(code)
+
+        elif action.type == 'solve' or action.type == 'solve_all':
+            queries.add(action.query)
+
+        elif action.type == 'insert' or action.type == 'remove':
+            # Number of the changed line.
+            line = code[:action.offset].count('\n')
+            # Current leaf node for this line.
+            parent = leaves[line]
+            # Tokens in this line after applying [action].
+            tokens_next = tuple(tokenize(get_line(code_next, line)))
+
+            # If a new node is inserted, clone each leaf into the next rank.
+            # This makes it easier to print the graph for graphviz; when
+            # analyzing the graph, duplicate nodes without siblings should be
+            # ignored.
+            new_leaves = {}
+
+            if action.text == '\n':
+                if action.type == 'insert':
+                    tokens_next_right = tuple(tokenize(get_line(code_next, line+1)))
+
+                    child_left = Node([rank, line, tokens_next])
+                    parent.add_out(child_left)
+
+                    child_right = Node([rank, line+1, tokens_next_right])
+                    parent.add_out(child_right)
+
+                    # Create new leaf nodes.
+                    for i, leaf in leaves.items():
+                        if i < line:
+                            new_leaves[i] = Node([rank, i, leaf.data[2]])
+                            leaf.add_out(new_leaves[i])
+                        elif i > line:
+                            new_leaves[i+1] = Node([rank, i+1, leaf.data[2]])
+                            leaf.add_out(new_leaves[i+1])
+                    new_leaves[line] = child_left
+                    new_leaves[line+1] = child_right
+
+                elif action.type == 'remove':
+                    parent_right = leaves[line+1]
+
+                    child = Node([rank, line, tokens_next])
+                    parent_right.add_out(child)
+                    parent.add_out(child)
+
+                    # Create new leaf nodes.
+                    for i, leaf in leaves.items():
+                        if i < line:
+                            new_leaves[i] = Node([rank, i, leaf.data[2]])
+                            leaf.add_out(new_leaves[i])
+                        elif i > line+1:
+                            new_leaves[i-1] = Node([rank, i-1, leaf.data[2]])
+                            leaf.add_out(new_leaves[i-1])
+                    new_leaves[line] = child
+            else:
+                # Skip the node if the next action is insert/remove (except \n)
+                # on the same line.
+                if action_id < len(actions)-1:
+                    action_next = actions[action_id+1]
+                    if action_next.type in ('insert', 'remove'):
+                        line_next = code_next[:action_next.offset].count('\n')
+                        if action_next.text != '\n' and line == line_next:
+                            continue
+
+                # Skip the node if it is the same as the parent.
+                if tokens_next == parent.data[2]:
+                    continue
+
+                child = Node([rank, line, tokens_next])
+                parent.add_out(child)
+
+                # Create new leaf nodes.
+                for i, leaf in leaves.items():
+                    if i != line:
+                        new_leaves[i] = Node([rank, i, leaf.data[2]])
+                        leaf.add_out(new_leaves[i])
+                new_leaves[line] = child
+
+            leaves = new_leaves
+            nodes += leaves.values()
+            rank += 1
+
+    return nodes, submissions, queries
+
+# Return all interesting edit paths in the edit graph rooted at [root].
+def get_paths(root, path=tuple(), done=None):
+    if done is None:
+        done = set()
+
+    cur_path = list(path)
+    if len(path) == 0 or path[-1] != root.data[2]:
+        cur_path.append(root.data[2])
+
+    # leaf node
+    if len(root.eout) == 0:
+        yield tuple(cur_path)
+    # empty node
+    elif len(path) > 1 and len(root.data[2]) == 0:
+        yield tuple(cur_path)
+
+    if len(root.data[2]) > 0:
+        new_path = cur_path
+    else:
+        new_path = [root.data[2]]
+        done.add(root)
+
+    for node in root.eout:
+        if node not in done:
+            yield from get_paths(node, tuple(new_path), done)
+
+# Build an edit graph for each trace and find "meaningful" (to be defined)
+# edits. Return a dictionary of edits and their frequencies, and also
+# submissions and queries in [traces].
+def get_edits_from_traces(traces):
+    # Helper function to remove trailing punctuation from lines. This is a
+    # rather ugly performance-boosting hack.
+    def remove_punct(line):
+        if line and line[-1].type in ('COMMA', 'PERIOD', 'SEMI', 'FROM'):
+            return line[:-1]
+        return line
+
+    # Return values: counts for observed edits, lines, submissions and queries.
+    edits = collections.Counter()
+    lines = collections.Counter()
+    submissions = collections.Counter()
+    queries = collections.Counter()
+
+    for trace in traces:
+        try:
+            actions = parse(trace)
+        except:
+            continue
+        nodes, trace_submissions, trace_queries = edit_graph(actions)
+
+        # Update the submissions/queries counters; rename variables first to
+        # remove trivial differences.
+        for submission in trace_submissions:
+            tokens = tokenize(submission)
+            rename_vars(tokens)
+            code = stringify(tokens)
+            submissions[code] += 1
+
+        for query in trace_queries:
+            tokens = tokenize(query)
+            rename_vars(tokens)
+            code = stringify(tokens)
+            queries[code] += 1
+
+        # Get edits.
+        for path in get_paths(nodes[0]):
+            for i in range(len(path)):
+                start = list(remove_punct(path[i]))
+                var_names = rename_vars(start)
+                start_t = tuple(start)
+
+                for j in range(len(path[i+1:])):
+                    end = list(remove_punct(path[i+1+j]))
+                    rename_vars(end, var_names)
+                    end_t = tuple(end)
+
+                    if start_t != end_t:
+                        edit = (start_t, end_t)
+                        edits[edit] += 1
+                        lines[start_t] += 1
+
+    # Discard rarely occurring edits. XXX only for testing
+    singletons = [edit for edit in edits if edits[edit] < 2]
+    for edit in singletons:
+        lines[edit[0]] -= edits[edit]
+        del edits[edit]
+
+    # Get the probability of each edit given its 'before' line.
+    for before, after in edits:
+        edits[(before, after)] /= lines[before]
+
+    # Normalize line frequencies.
+    if len(lines) > 0:
+        lines_max = max(lines.values())
+        lines = {line: count/lines_max for line, count in lines.items()}
+
+    return edits, lines, submissions, queries
+
+def classify_edits(edits):
+    inserts = {}
+    removes = {}
+    changes = {}
+    for (before, after), cost in edits.items():
+        if after and not before:
+            inserts[after] = cost
+        elif before and not after:
+            removes[before] = cost
+        else:
+            changes[(before, after)] = cost
+    return inserts, removes, changes
+
+# Simplify an edit graph with given nodes: remove empty leaf nodes and other
+# fluff. The function is called recursively until no more changes are done.
+def clean_graph(nodes):
+    changed = False
+
+    # A
+    # | --> A (when B is an empty leaf)
+    # B
+    for node in nodes:
+        if len(node.eout) == 0 and len(node.ein) == 1 and len(node.data[2]) == 0:
+            parent = node.ein[0]
+            parent.eout.remove(node)
+            nodes.remove(node)
+            changed = True
+            break
+
+    # A
+    # | --> A
+    # A
+    for node in nodes:
+        if len(node.ein) == 1:
+            parent = node.ein[0]
+            if len(parent.eout) == 1 and node.data[2] == parent.data[2]:
+                parent.eout = node.eout
+                for child in parent.eout:
+                    child.ein = [parent if node == node else node for node in child.ein]
+                nodes.remove(node)
+                changed = True
+                break
+
+    # A       A
+    # |\      |
+    # | C --> |   (when C is empty)
+    # |/      |
+    # B       B
+    for node in nodes:
+        if len(node.data[2]) == 0 and len(node.ein) == 1 and len(node.eout) == 1:
+            parent = node.ein[0]
+            child = node.eout[0]
+            if len(parent.eout) == 2 and len(child.ein) == 2:
+                parent.eout = [n for n in parent.eout if n != node]
+                child.ein = [n for n in child.ein if n != node]
+                nodes.remove(node)
+                changed = True
+                break
+
+    # A
+    # |
+    # C --> A
+    # |
+    # A
+    for node in nodes:
+        if len(node.data[2]) == 0 and len(node.ein) == 1 and len(node.eout) == 1:
+            parent = node.ein[0]
+            child = node.eout[0]
+            if len(parent.eout) == 1 and len(child.ein) == 1 and parent.data[2] == child.data[2]:
+                parent.eout = [child]
+                child.ein = [parent]
+                nodes.remove(node)
+                changed = True
+                break
+
+    if changed:
+        # go again until nothing changes
+        clean_graph(nodes)
+    else:
+        # compact node ranks
+        ranks = set([node.data[0] for node in nodes])
+        missing = set(range(1,max(ranks)+1)) - ranks
+
+        for node in nodes:
+            diff = 0
+            for rank in sorted(missing):
+                if rank >= node.data[0]:
+                    break
+                diff += 1
+            node.data[0] -= diff
author	Timotej Lazar <timotej.lazar@araneo.org>	2015-01-15 12:10:22 +0100
committer	Aleš Smodiš <aless@guru.si>	2015-08-11 14:26:01 +0200
commit	819ab10281c9bd6c000364c3a243959edd18abf7 (patch)
tree	5ca3452418b49781563221bb56cf70e1d0fb1bb8 /monkey/edits.py
parent	d86793039957aa408a98806aecfb5964bda5fb87 (diff)