Diffstat (limited to 'monkey/edits.py')
-rw-r--r--  monkey/edits.py  332
1 file changed, 142 insertions(+), 190 deletions(-)
diff --git a/monkey/edits.py b/monkey/edits.py
index 17f13d3..7744fa2 100644
--- a/monkey/edits.py
+++ b/monkey/edits.py
@@ -19,26 +19,20 @@ import math
from .action import expand, parse
from .graph import Node
-from prolog.util import annotate, normalized, rename_vars, stringify, tokenize
+from prolog.util import Token, normalized, parse as prolog_parse, rename_vars, rename_vars_ast2, rename_vars_list, interesting_ranges, stringify, tokenize
from .util import get_line, avg, logistic
-# Parse the sequence of actions in [trace] and return a directed acyclic graph
-# representing development history. Each node represents a particular version
-# of some line, and each edge represents a "line edit" (contiguous sequence of
-# inserts/removes within a single line).
-# Return a list of nodes with root as the first element. Also return sets of
-# submissions (user-tested program versions) and queries in this attempt.
-def trace_graph(trace):
- # Return values.
- nodes = [Node([0, 0, ()])] # Node data: rank (Y), line no. (X), and tokens.
+def get_edits_from_trace(trace, test, uid):
submissions = set() # Program versions at 'test' actions.
queries = set() # Queries run by the student.
- # State variables.
- leaves = {0: nodes[0]} # Current leaf node for each line.
- rank = 1 # Rank (order / y-position) for the next node.
- code_next = '' # Program code after applying the current action.
- done = False # Set to True on first correct version.
+    # For each observed edit, store a list of user IDs (with repeats) of the
+    # traces in which it was observed.
+ edits = collections.defaultdict(list)
+ def add_edit(path, start, end, tree):
+ if start == end:
+ return
+        edits[(path, start, end)].append(uid)
# Parse trace actions and ensure there is a separate action for each
# inserted/removed character.
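
Note on the per-character expansion mentioned in the comment above: expand() is
defined in monkey.action, which this diff does not show, so the following is
only a hypothetical sketch of the idea with a stand-in Action type. Offsets
are assumed to advance for successive inserted characters and stay fixed for
successive removed ones.

    from collections import namedtuple

    # Hypothetical stand-in for monkey.action's Action class.
    Action = namedtuple('Action', 'type offset text')

    def expand_chars(actions):
        # Split multi-character inserts/removes into one action per character.
        for a in actions:
            if a.type == 'insert' and len(a.text) > 1:
                # Each inserted char lands one position after the previous one.
                for i, ch in enumerate(a.text):
                    yield Action('insert', a.offset + i, ch)
            elif a.type == 'remove' and len(a.text) > 1:
                # Removing n chars at an offset is n single-char removes there.
                for ch in a.text:
                    yield Action('remove', a.offset, ch)
            else:
                yield a
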
@@ -49,194 +43,139 @@ def trace_graph(trace):
# Only a few traces fail to parse, so just skip them.
actions = []
+ # State variables.
+ open_edits = []
+ code_next = '' # Program code after applying the current action.
+ done = False # Set to True on first correct version.
+ prev_tree = None
+ prev_action = None
+
for action_id, action in enumerate(actions):
code = code_next
code_next = action.apply(code)
- if action.type == 'test':
- submissions.add((code, action.total == action.passed))
- if action.total == action.passed:
- done = True
-
- elif action.type == 'solve' or action.type == 'solve_all':
- queries.add(action.query.rstrip(' .'))
-
- elif action.type == 'insert' or action.type == 'remove':
- # Ignore edits after the first correct version.
- if done:
- continue
-
- # Number of the changed line.
- line = code[:action.offset].count('\n')
- # Current leaf node for this line.
- parent = leaves[line]
- # Tokens in this line after applying [action].
- tokens_next = tuple(tokenize(get_line(code_next, line)))
-
- # If a new node is inserted, clone each leaf into the next rank.
- # This makes it easier to print the graph for graphviz; when
- # analyzing the graph, duplicate nodes without siblings should be
- # ignored.
- new_leaves = {}
-
- if action.text == '\n':
- if action.type == 'insert':
- tokens_next_right = tuple(tokenize(get_line(code_next, line+1)))
-
- child_left = Node([rank, line, tokens_next])
- parent.add_out(child_left)
-
- child_right = Node([rank, line+1, tokens_next_right])
- parent.add_out(child_right)
-
- # Create new leaf nodes.
- for i, leaf in leaves.items():
- if i < line:
- new_leaves[i] = Node([rank, i, leaf.data[2]])
- leaf.add_out(new_leaves[i])
- elif i > line:
- new_leaves[i+1] = Node([rank, i+1, leaf.data[2]])
- leaf.add_out(new_leaves[i+1])
- new_leaves[line] = child_left
- new_leaves[line+1] = child_right
-
- elif action.type == 'remove':
- parent_right = leaves[line+1]
-
- child = Node([rank, line, tokens_next])
- parent_right.add_out(child)
- parent.add_out(child)
-
- # Create new leaf nodes.
- for i, leaf in leaves.items():
- if i < line:
- new_leaves[i] = Node([rank, i, leaf.data[2]])
- leaf.add_out(new_leaves[i])
- elif i > line+1:
- new_leaves[i-1] = Node([rank, i-1, leaf.data[2]])
- leaf.add_out(new_leaves[i-1])
- new_leaves[line] = child
- else:
- # Skip the node if the next action is insert/remove (except \n)
- # on the same line.
- if action_id < len(actions)-1:
- action_next = actions[action_id+1]
- if action_next.type in ('insert', 'remove'):
- line_next = code_next[:action_next.offset].count('\n')
- if action_next.text != '\n' and line == line_next:
- continue
-
- # Skip the node if it is the same as the parent.
- if tokens_next == parent.data[2]:
- continue
-
- child = Node([rank, line, tokens_next])
- parent.add_out(child)
-
- # Create new leaf nodes.
- for i, leaf in leaves.items():
- if i != line:
- new_leaves[i] = Node([rank, i, leaf.data[2]])
- leaf.add_out(new_leaves[i])
- new_leaves[line] = child
-
- leaves = new_leaves
- nodes += leaves.values()
- rank += 1
-
- return nodes, submissions, queries
-
-# Return a set of edits that appear in the trace_graph given by [nodes].
-def graph_edits(nodes):
- edits = set()
- for a in nodes:
- a_data = a.data[2]
- for b in a.eout:
- b_data = b.data[2]
-
- # Normalize a → b into start → end. Reuse variable names from a
- # when normalizing b.
- var_names = {}
- start = normalized(a_data, var_names)
- end = normalized(b_data, var_names)
-
- if start == end:
- continue
-
- # An edit start → ε happens each time the user inserts \n; ignore
- # such cases.
- if not end and len(a.eout) > 1:
- continue
-
- # Skip edits where start/end is composed of more than one part.
- # TODO improve trace_graph to handle this instead.
- if [t for t in annotate(stringify(start)) if t.part > 0]:
- continue
- if [t for t in annotate(stringify(end)) if t.part > 0]:
- continue
-
- edits.add((start, end))
- return edits
-
-# Build an edit graph for each trace and find "meaningful" (to be defined)
-# edits. Return a dictionary of edits and their frequencies, and also
-# submissions and queries in [traces].
-def get_edits_from_traces(traces):
- # Return values: counts for observed edits, lines, submissions and queries.
- edits = collections.Counter()
+        if action.type in {'solve', 'solve_all'}:
+            queries.add(action.query.rstrip(' .'))
+        elif action.type == 'test':
+            correct = test(code)
+            submissions.add((code, correct))
+            if correct:
+                # Ignore actions after the first correct version.
+                done = True
+                break
+
+ tree = prolog_parse(code)
+ if tree and tree.leaves() and tree != prev_tree:
+ for terminals, path in interesting_ranges(tree):
+ pos_start = terminals[0].pos
+ pos_end = terminals[-1].pos + len(terminals[-1].val)
+ # If there is an open edit with the same range, don't add a new one.
+                if not any(e_start == pos_start and e_end == pos_end
+                           for *_, e_start, e_end in open_edits):
+                    open_edits.append([tree, terminals, path, pos_start, pos_end])
+ prev_tree = tree
+
+ if action.type in {'insert', 'remove'}:
+ new_open_edits = []
+ for start_tree, start_tokens, path, pos_start, pos_end in open_edits:
+ new_pos_start, new_pos_end = pos_start, pos_end
+ if action.type == 'remove':
+ if action.offset < pos_end:
+ new_pos_end -= 1
+ if action.offset < pos_start:
+ new_pos_start -= 1
+ elif action.type == 'insert':
+ if action.offset < pos_start:
+ new_pos_start += 1
+ new_pos_end += 1
+ elif action.offset == pos_start:
+ new_pos_end += 1
+ elif action.offset < pos_end:
+ new_pos_end += 1
+ elif action.offset == pos_end:
+                    if (prev_action is None or
+                            (prev_action.type in ('insert', 'remove') and
+                             prev_action.offset == action.offset-1)):
+ orig_next = None
+ for terminal in start_tree.leaves():
+ if terminal.pos >= start_tokens[-1].pos + len(start_tokens[-1].val):
+ orig_next = terminal
+ break
+ if not (orig_next and orig_next.val[0] == action.text):
+ new_pos_end += 1
+ if new_pos_start != new_pos_end:
+ new_open_edits.append([start_tree, start_tokens, path, new_pos_start, new_pos_end])
+ open_edits = new_open_edits
+ prev_action = action
+
+ if done:
+ for start_tree, start_tokens, path, pos_start, pos_end in open_edits:
+ end_tokens = tokenize(code[pos_start:pos_end])
+ names = {}
+ start_normal = rename_vars_list(start_tokens, names)
+ end_normal = rename_vars_list(end_tokens, names)
+ norm_tree = rename_vars_ast2(start_tree, names)
+ add_edit(path, tuple(start_normal), tuple(end_normal), norm_tree)
+
+ return edits, submissions, queries
+
+def get_edits_from_solutions(solutions, test):
+    # For each observed edit, store a list of user IDs (with repeats) of the
+    # traces in which it was observed.
submissions = collections.Counter()
queries = collections.Counter()
- names = collections.Counter()
+ edits = collections.defaultdict(list)
+
+ for solution in solutions:
+ trace = solution.trace
+ uid = solution.codeq_user_id
+ trace_edits, trace_submissions, trace_queries = get_edits_from_trace(trace, test, uid)
- # Counts of traces where each line appears as a leaf / any node.
- n_leaf = collections.Counter()
- n_all = collections.Counter()
+ # Update edits.
+ for edit, features in trace_edits.items():
+ edits[edit].extend(features)
- for trace in traces:
- nodes, trace_submissions, trace_queries = trace_graph(trace)
- counted_tokens = []
- # Update the submissions/queries counters (use normalized variables).
- for (submission, correct) in trace_submissions:
- if correct:
- tokens = list(tokenize(submission))
- for token in tokens:
- if token.type == 'NAME' and token.val not in counted_tokens:
- names[token.val] += 1
- counted_tokens.append(token.val)
- code = stringify(rename_vars(tokens))
- submissions[code] += 1
+ # Update submission/query counters (use normalized variables).
+ for code, correct in trace_submissions:
+ code = stringify(rename_vars(tokenize(code)))
+ submissions[(code, correct)] += 1
for query in trace_queries:
code = stringify(rename_vars(tokenize(query)))
queries[code] += 1
- # Update the edit and leaf/node counters.
- edits.update(graph_edits(nodes))
- n_leaf.update(set([normalized(n.data[2]) for n in nodes if n.data[2] and not n.eout]))
- n_all.update(set([normalized(n.data[2]) for n in nodes if n.data[2]]))
-
    # Discard edits that were observed fewer than two times in total.
- singletons = [edit for edit in edits if edits[edit] < 2]
+ singletons = [edit for edit in edits if len(edits[edit]) < 2]
for edit in singletons:
del edits[edit]
+    n_start = collections.Counter()
+    for (path, a, b), features in edits.items():
+        edits[(path, a, b)] = len(features)
+        n_start[(path, a)] += len(features)
+
# Find the probability of each edit a → b.
- for (a, b), count in edits.items():
- p = 1.0
- if a:
- p *= 1 - (n_leaf[a] / (n_all[a]+1))
- if b:
- b_normal = normalized(b)
- p *= n_leaf[b_normal] / (n_all[b_normal]+1)
- if a and b:
- p = math.sqrt(p)
- edits[(a, b)] = p
+ new_edits = {}
+ for (path, a, b), count in edits.items():
+ if a != b:
+ p = count / n_start[(path, a)]
+ new_edits[(path, a, b)] = p
+ edits = new_edits
# Tweak the edit distribution to improve search.
avg_p = avg(edits.values())
for edit, p in edits.items():
edits[edit] = logistic(p, k=3, x_0=avg_p)
- return edits, submissions, queries, names
+ return edits, submissions, queries
def classify_edits(edits):
inserts = {}
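
The insert/remove branch in the hunk above keeps each open edit's character
range in sync with the evolving program text. Stripped of the special case for
an insert exactly at pos_end (which consults the previous action and the next
terminal of the original parse tree), the shifting rule reduces to this
self-contained sketch:

    def shift_range(pos_start, pos_end, action_type, offset):
        # Shift a half-open [pos_start, pos_end) range after a one-char edit.
        if action_type == 'remove':
            if offset < pos_end:
                pos_end -= 1
            if offset < pos_start:
                pos_start -= 1
        elif action_type == 'insert':
            if offset < pos_start:
                pos_start += 1
                pos_end += 1
            elif offset < pos_end:  # includes an insert right at pos_start
                pos_end += 1
        return pos_start, pos_end

    # A range over offsets 4-7 moves right when a char is inserted before it,
    assert shift_range(4, 7, 'insert', 2) == (5, 8)
    # and shrinks when a char inside it is removed.
    assert shift_range(4, 7, 'remove', 5) == (4, 6)
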
@@ -255,7 +194,9 @@ def classify_edits(edits):
# Extract edits and other data from existing traces for each problem.
if __name__ == '__main__':
import pickle
- from db.models import Problem, Solution
+ from db.models import CodeqUser, Problem, Solution
+ import server.problems
+ from termcolor import colored
# Ignore traces from these users.
ignored_users = [
@@ -265,21 +206,32 @@ if __name__ == '__main__':
358, # sasha
]
- edits, submissions, queries, names = {}, {}, {}, {}
+ edits, submissions, queries = {}, {}, {}
for problem in Problem.list():
pid = problem.id
- solutions = Solution.filter(problem_id=pid, done=True)
- traces = [s.trace for s in solutions if s.codeq_user_id not in ignored_users]
- if not traces:
+ solutions = [s for s in Solution.filter(problem_id=pid, done=True)
+ if s.codeq_user_id not in ignored_users]
+ if not solutions:
print('No traces for {}'.format(problem.identifier))
continue
+ # Testing function.
+ solved_problems = [p for p in CodeqUser.solved_problems(1, problem.language)
+ if p != (problem.group, problem.identifier)]
+ other_solutions = server.problems.solutions_for_problems(problem.language, solved_problems)
+ problem_module = server.problems.load_problem(problem.language, problem.group, problem.identifier, 'common')
+ def test(code):
+ correct, hints = problem_module.test(code, solved_problems)
+ return correct
+
print('Analyzing traces for {}… '.format(problem.identifier), end='', flush=True)
+ print('{} traces… '.format(len(solutions)), end='', flush=True)
try:
- edits[pid], submissions[pid], queries[pid], names[pid] = get_edits_from_traces(traces)
- print('{} edits, {} submissions, {} queries, {} names'.format(
- len(edits[pid]), len(submissions[pid]), len(queries[pid]), len(names[pid])))
+ edits[pid], submissions[pid], queries[pid] = get_edits_from_solutions(solutions, test)
+ print('{} edits, {} submissions, {} queries'.format(
+ len(edits[pid]), len(submissions[pid]), len(queries[pid])))
except Exception as ex:
- print('error: {}'.format(ex))
+ import traceback
+ traceback.print_exc()
- pickle.dump((edits, submissions, queries, names), open('edits.pickle', 'wb'))
+ pickle.dump((edits, submissions, queries), open('edits.pickle', 'wb'))
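
Worked example of the probability step in get_edits_from_solutions: each
edit's raw score is its observation count divided by the total count of edits
sharing the same (path, start), and the scores are then squashed around their
mean. The logistic below is an assumed standard sigmoid; only its call
signature, logistic(p, k=3, x_0=avg_p), appears in this diff.

    import math

    def logistic(x, k=1, x_0=0):
        # Assumed definition; the real one lives in monkey/util.py (not shown).
        return 1 / (1 + math.exp(-k * (x - x_0)))

    # Two competing edits from the same (path, start): seen 6 and 2 times.
    counts = {('p', 'a', 'b'): 6, ('p', 'a', 'c'): 2}
    n_start = {('p', 'a'): 8}

    probs = {e: n / n_start[e[:2]] for e, n in counts.items()}
    # probs == {('p', 'a', 'b'): 0.75, ('p', 'a', 'c'): 0.25}
    avg_p = sum(probs.values()) / len(probs)  # 0.5
    tweaked = {e: logistic(p, k=3, x_0=avg_p) for e, p in probs.items()}
    # ≈ 0.68 and 0.32: edits above the mean are boosted, the rest damped.
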