1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
import argparse
import ast
import collections
import os
import pandas
import sklearn.dummy
import sklearn.ensemble
import sklearn.model_selection
import sklearn.tree
import canonicalize
import dynamic
import regex
def get_programs(path: str, names: str, do_canonicalize: bool = False):
    """Load the distinct, syntactically valid programs stored under path.

    The directory is read as ``<path>/<user id>/submissions/<file>``; only
    entries of ``path`` whose name consists of digits are treated as users,
    and they are visited in increasing numeric order.  Each submission file
    name is split on ``'-'`` into three fields ``seq-total-passed``.

    Files that cannot be read as text or that are not valid Python are
    skipped, as are repeated submissions of the same (possibly
    canonicalized) text by the same user.

    Args:
        path: data directory with one sub-directory per user.
        names: passed through unchanged as ``given_names`` to
            ``canonicalize.canonicalize``; unused unless do_canonicalize
            is true.  NOTE(review): the script entry point passes the value
            of the ``--names`` option here, which is a list or None rather
            than a str -- confirm what canonicalize expects.
        do_canonicalize: if true, programs are keyed (and de-duplicated) by
            their canonicalized text instead of the stripped file content.

    Returns:
        A dict mapping program text to a dict with the keys
        ``'users'`` (set of int ids of the users who submitted it),
        ``'correct'`` (whether the ``total`` and ``passed`` fields of the
        first submission seen with this text are equal) and
        ``'original'`` (stripped file content of that first submission).

    Raises:
        ValueError: if the name of an accepted submission file does not
            consist of exactly three ``'-'``-separated fields.
    """
    programs = {}
    # Keep the directory names exactly as found and only *order* them
    # numerically: converting to int and back would turn '007' into '7' and
    # point at a directory that does not exist.
    user_dirs = sorted(
        (uid for uid in os.listdir(path) if uid.isdigit()),
        key=int,
    )
    for uid in user_dirs:
        user = int(uid)
        user_dir = os.path.join(path, uid, 'submissions')
        user_subs = set()
        for submission in os.listdir(user_dir):
            with open(os.path.join(user_dir, submission), 'r') as f:
                try:
                    code = f.read().strip()
                except (OSError, UnicodeError):
                    # unreadable or undecodable file: ignore this submission
                    continue

            # skip syntactically incorrect programs; besides SyntaxError the
            # parser can reject sources with ValueError (e.g. null bytes) or
            # give up on pathological input with RecursionError/MemoryError
            try:
                ast.parse(code)
            except (SyntaxError, ValueError, RecursionError, MemoryError):
                continue

            # canonicalize
            original = code
            if do_canonicalize:
                code = canonicalize.canonicalize(code, given_names=names)

            # count every distinct program at most once per user
            if code in user_subs:
                continue
            user_subs.add(code)

            seq, total, passed = submission.split('-')
            if code not in programs:
                # correctness and original text come from the first
                # submission encountered with this program text
                programs[code] = {
                    'users': set(),
                    'correct': total == passed,
                    'original': original,
                }
            programs[code]['users'].add(user)
    return programs
def main():
    """Evaluate how well program attributes predict program correctness.

    Parses the command line, loads the canonicalized programs from the data
    directory, adds one boolean column per attribute returned by
    ``regex.get_attributes`` and ``dynamic.get_attributes`` (true when the
    program is listed under that attribute), and prints the mean 10-fold
    cross-validation score of every learner on a 70% sample of the programs.
    """
    parser = argparse.ArgumentParser(description='Get patterns from student programs.')
    parser.add_argument('path', help='path to data directory')
    parser.add_argument('--names', nargs='*', required=False, help='names that should not be anonymized')
    parser.add_argument('--exec', required=False, help='code to append for dynamic patterns')
    parser.add_argument('--inputs', nargs='*', required=False, help='inputs for dynamic patterns')
    args = parser.parse_args()

    # drop trailing slashes from the data directory before using it
    path = args.path.rstrip('/')
    programs = get_programs(path, args.names, do_canonicalize=True)

    attrs = collections.OrderedDict()
    attrs.update(regex.get_attributes(programs))
    attrs.update(dynamic.get_attributes(programs, args.exec, args.inputs))

    # one boolean feature per attribute: is this program listed under it?
    for program, info in programs.items():
        for attr, attr_info in attrs.items():
            info[attr] = program in attr_info['programs']

    data = pandas.DataFrame.from_dict(programs, orient='index')
    # only a fixed 70% sample is used; the rest is not touched by this script
    train = data.sample(frac=0.7, random_state=0)
    Y = train['correct']
    X = train.drop(['users', 'correct', 'original'], axis='columns')

    learners = collections.OrderedDict([
        # The default strategy of DummyClassifier differs between
        # scikit-learn releases; spell out the majority-class baseline that
        # the 'major' label refers to.
        ('major', sklearn.dummy.DummyClassifier(strategy='most_frequent')),
        # Seed the randomized learners like the sampling above so that
        # repeated runs print the same scores.
        ('tree', sklearn.tree.DecisionTreeClassifier(random_state=0)),
        ('rf', sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=0)),
    ])
    for name, learner in learners.items():
        scores = sklearn.model_selection.cross_val_score(learner, X, Y, cv=10)
        print('{}:\t{}'.format(name, scores.mean()))


if __name__ == '__main__':
    main()
|