1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
|
# CodeQ: an online programming tutor.
# Copyright (C) 2015 UL FRI
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import collections
import math
from .action import expand, parse
from .graph import Node
from prolog.util import Token, normalized, parse as prolog_parse, rename_vars, rename_vars_ast, rename_vars_list, interesting_ranges, stringify, tokenize
from .util import get_line, avg, logistic
def get_edits_from_trace(trace, test, id):
    """Extract edits, submissions and queries from one solution trace.

    Returns a triple:
      - edits: dict mapping (path, start_tokens, end_tokens) to a list of
        trace ids where the edit was observed,
      - submissions: set of (code, correct) pairs recorded at 'test' actions,
      - queries: set of queries the student ran.

    NOTE(review): the indentation below was reconstructed from a
    whitespace-mangled source — verify block nesting against VCS history.
    """
    submissions = set()  # Program versions at 'test' actions.
    queries = set()  # Queries run by the student.

    # For each observed edit, store a list of features (with repeats) of ASTs
    # where they were observed.
    edits = collections.defaultdict(list)
    def add_edit(path, start, end, tree):
        # Ignore no-op edits.
        # NOTE(review): the tree argument is currently unused — only the
        # trace id is recorded per edit.
        if start == end:
            return
        edits[(path, start, end)].append(id)

    # Parse trace actions and ensure there is a separate action for each
    # inserted/removed character.
    try:
        actions = parse(trace)
        expand(actions)
    except:
        # Only a few traces fail to parse, so just skip them.
        actions = []

    # State variables.
    open_edits = []      # Edit ranges currently being tracked.
    code_next = ''  # Program code after applying the current action.
    done = False  # Set to True on first correct version.
    prev_tree = None     # AST of the last parseable snapshot.
    prev_action = None   # Previous action (used to detect continuous typing).

    for action_id, action in enumerate(actions):
        code = code_next
        code_next = action.apply(code)

        if action.type in {'solve', 'solve_all', 'test'}:
            if action.type == 'solve' or action.type == 'solve_all':
                queries.add(action.query.rstrip(' .'))
            elif action.type == 'test':
                correct = test(code)
                submissions.add((code, correct))
                if correct:
                    # Ignore actions after the first correct version.
                    done = True
                    break

            # Open a new edit for each interesting range of the current AST
            # (skip if the program did not change since the last snapshot).
            tree = prolog_parse(code)
            if tree and tree.leaves() and tree != prev_tree:
                for terminals, path in interesting_ranges(tree):
                    pos_start = terminals[0].pos
                    pos_end = terminals[-1].pos + len(terminals[-1].val)
                    # If there is an open edit with the same range, don't add a new one.
                    found = False
                    for e_start_tree, e_start_tokens, e_path, e_pos_start, e_pos_end in open_edits:
                        if e_pos_start == pos_start and e_pos_end == pos_end:
                            found = True
                            break
                    if not found:
                        #print('OPENING {}'.format(terminals))
                        open_edits.append([tree, terminals, path, pos_start, pos_end])
                prev_tree = tree

        if action.type in {'insert', 'remove'}:
            # Shift the character range of every open edit to account for the
            # single inserted/removed character; drop collapsed ranges.
            new_open_edits = []
            for start_tree, start_tokens, path, pos_start, pos_end in open_edits:
                new_pos_start, new_pos_end = pos_start, pos_end
                if action.type == 'remove':
                    if action.offset < pos_end:
                        new_pos_end -= 1
                    if action.offset < pos_start:
                        new_pos_start -= 1
                elif action.type == 'insert':
                    if action.offset < pos_start:
                        new_pos_start += 1
                        new_pos_end += 1
                    elif action.offset == pos_start:
                        new_pos_end += 1
                    elif action.offset < pos_end:
                        new_pos_end += 1
                    elif action.offset == pos_end:
                        # Inserting exactly at the end of the range: extend it
                        # only if the user appears to be typing continuously
                        # here (previous action was an insert/remove at the
                        # adjacent offset, or there was no previous action)…
                        if (prev_action is None or
                            prev_action.type == 'insert' and prev_action.offset == action.offset-1 or
                            prev_action.type == 'remove' and prev_action.offset == action.offset-1):
                            # …and the typed character does not match the
                            # first character of the terminal that followed
                            # the range in the original AST.
                            orig_next = None
                            for terminal in start_tree.leaves():
                                if terminal.pos >= start_tokens[-1].pos + len(start_tokens[-1].val):
                                    orig_next = terminal
                                    break
                            if not (orig_next and orig_next.val[0] == action.text):
                                new_pos_end += 1
                        pass
                if new_pos_start != new_pos_end:
                    new_open_edits.append([start_tree, start_tokens, path, new_pos_start, new_pos_end])
            open_edits = new_open_edits

        prev_action = action

    if done:
        # Trace ended with a correct program: record every still-open edit,
        # normalizing variable names consistently across the start tokens,
        # end tokens and the start AST (shared `names` mapping).
        for start_tree, start_tokens, path, pos_start, pos_end in open_edits:
            end_tokens = tokenize(code[pos_start:pos_end])
            names = {}
            start_normal = rename_vars_list(start_tokens, names)
            end_normal = rename_vars_list(end_tokens, names)
            norm_tree = rename_vars_ast(start_tree, names)
            add_edit(path, tuple(start_normal), tuple(end_normal), norm_tree)

    return edits, submissions, queries
def get_edits_from_solutions(solutions, test):
    """Aggregate edits, submissions and queries over all solution traces.

    For every observed edit a → b at a given AST path, estimate its
    probability as the relative frequency among all edits starting from
    (path, a), then squash the distribution through a logistic curve
    centered at the mean probability to improve search.

    Returns (edits, submissions, queries) where edits maps
    (path, a, b) → probability, and submissions/queries are Counters over
    variable-normalized programs/queries.
    """
    # Count submissions/queries over all traces (variables normalized).
    submissions = collections.Counter()
    queries = collections.Counter()
    # For each observed edit, store a list of features (with repeats) of ASTs
    # where they were observed.
    edits = collections.defaultdict(list)

    for solution in solutions:
        trace = solution.trace
        uid = solution.codeq_user_id
        trace_edits, trace_submissions, trace_queries = get_edits_from_trace(trace, test, uid)

        # Update edits.
        for edit, features in trace_edits.items():
            edits[edit].extend(features)

        # Update submission/query counters (use normalized variables).
        for code, correct in trace_submissions:
            code = stringify(rename_vars(tokenize(code)))
            submissions[(code, correct)] += 1
        for query in trace_queries:
            code = stringify(rename_vars(tokenize(query)))
            queries[code] += 1

    # Discard edits that only occur in one trace.
    singletons = [edit for edit in edits if len(edits[edit]) < 2]
    for edit in singletons:
        del edits[edit]

    # Replace feature lists with counts and tally how many edits start from
    # each (path, a) pair.  (Fix: dropped the n_start_all accumulator, which
    # was computed but never used.)
    n_start = collections.Counter()
    for (path, a, b), features in edits.items():
        edits[(path, a, b)] = len(features)
        n_start[(path, a)] += len(features)

    # Find the probability of each edit a → b.
    new_edits = {}
    for (path, a, b), count in edits.items():
        if a != b:
            p = count / n_start[(path, a)]
            new_edits[(path, a, b)] = p
    edits = new_edits

    # Tweak the edit distribution to improve search.
    # Fix: guard against an empty edit set (e.g. all edits were singletons),
    # where taking the average would divide by zero.
    if edits:
        avg_p = avg(edits.values())
        for edit, p in edits.items():
            edits[edit] = logistic(p, k=3, x_0=avg_p)

    return edits, submissions, queries
def classify_edits(edits):
    """Partition edits into insertions, removals and changes.

    An edit whose "before" part is empty is an insertion (keyed by the
    inserted tokens), one whose "after" part is empty is a removal (keyed by
    the removed tokens), and everything else is a change (keyed by the
    (before, after) pair).  Costs are carried over unchanged.
    """
    items = edits.items()
    inserts = {after: cost for (before, after), cost in items
               if after and not before}
    removes = {before: cost for (before, after), cost in items
               if before and not after}
    changes = {(before, after): cost for (before, after), cost in items
               if not (after and not before) and not (before and not after)}
    return inserts, removes, changes
# Extract edits and other data from existing traces for each problem.
if __name__ == '__main__':
    import pickle
    from db.models import CodeqUser, Problem, Solution
    import server.problems
    from termcolor import colored

    # Ignore traces from these users.
    ignored_users = [
        1,    # admin
        231,  # test
        360,  # test2
        358,  # sasha
    ]

    edits, submissions, queries = {}, {}, {}
    for problem in Problem.list():
        pid = problem.id

        # Completed traces for this problem, excluding test/admin accounts.
        solutions = [s for s in Solution.filter(problem_id=pid, done=True)
                     if s.codeq_user_id not in ignored_users]
        if not solutions:
            print('No traces for {}'.format(problem.identifier))
            continue

        # Testing function.
        # NOTE(review): user id 1 is hard-coded here — presumably the admin
        # account's solved-problem set; confirm this is intended.
        solved_problems = [p for p in CodeqUser.solved_problems(1, problem.language)
                           if p != (problem.group, problem.identifier)]
        other_solutions = server.problems.solutions_for_problems(problem.language, solved_problems)
        problem_module = server.problems.load_problem(problem.language, problem.group, problem.identifier, 'common')
        # NOTE(review): other_solutions is computed but unused — verify
        # whether problem_module.test should receive it instead of
        # solved_problems as auxiliary code.
        def test(code):
            correct, hints = problem_module.test(code, solved_problems)
            return correct

        print('Analyzing traces for {}… '.format(problem.identifier), end='', flush=True)
        print('{} traces… '.format(len(solutions)), end='', flush=True)
        try:
            edits[pid], submissions[pid], queries[pid] = get_edits_from_solutions(solutions, test)
            print('{} edits, {} submissions, {} queries'.format(
                len(edits[pid]), len(submissions[pid]), len(queries[pid])))
        except Exception:
            # Report the failure but keep processing the remaining problems.
            # (Fix: dropped the unused `as ex` binding.)
            import traceback
            traceback.print_exc()

    # Fix: close the output file deterministically — the original passed
    # open() directly to pickle.dump and leaked the file handle.
    with open('edits.pickle', 'wb') as f:
        pickle.dump((edits, submissions, queries), f)
|