1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
import argparse
import collections
import os
import random
from monkey.patterns import get_patterns
from prolog.util import parse as prolog_parse
# Command-line interface: a single positional argument naming the data
# directory.
parser = argparse.ArgumentParser(
    description='Get patterns from student programs.',
)
parser.add_argument(
    'path',
    help='path to data directory',
)
args = parser.parse_args()

# Strip trailing slashes so the last path component below is not empty.
path = args.path.rstrip('/')
# Last component of the data directory path; it is later looked up in the
# text of every program (presumably it is the name of the predicate being
# solved -- confirm against the data layout).
name = os.path.split(path)[1]
# Directory holding one sub-directory of submissions per user.
submissions = os.path.join(path, 'submissions')
# Select test/train users.  Directory entries are converted to integer user
# IDs and sorted first so that the seeded shuffle below always yields the
# same split for the same set of users.
users = [int(entry) for entry in os.listdir(submissions)]
users.sort()
random.Random(0).shuffle(users)

# The first 70% of the shuffled users are used for learning, the rest for
# testing.
split = int(len(users)*0.7)
learn_users, test_users = set(users[:split]), set(users[split:])

# Save test users to file, one ID per line.
with open(path + '/users-test.txt', 'wt') as users_file:
    users_file.writelines('{}\n'.format(user) for user in test_users)
# Find test/train programs.  Each entry is a (code, correct) pair, where
# `correct` is true when the total and passed counts in the file name match.
data = {'train': [], 'test': []}
for user in users:
    user_dir = os.path.join(submissions, str(user))
    # All programs of one user end up in the same partition.
    bucket = data['test' if user in test_users else 'train']
    # Program texts already seen for this user.
    seen = set()
    # each submission is in a file named <seq. no>-<total tests>-<passed tests>
    for submission in os.listdir(user_dir):
        with open(os.path.join(user_dir, submission), 'r') as source:
            code = source.read()
        # do not add a program twice for the same user
        if code in seen:
            continue
        seen.add(code)
        # skip syntactically incorrect programs and programs that do not
        # mention the predicate name at all
        if prolog_parse(code) is None or name not in code:
            continue
        seq, total, passed = submission.split('-')
        bucket.append((code, total == passed))
# Print info about test users and test/train programs.
print('Test users:')
print(test_users)
print()
for which in ('train', 'test'):
    # Split the partition's program texts by correctness flag.
    passing = [code for code, correct in data[which] if correct]
    failing = [code for code, correct in data[which] if not correct]
    print('Programs ({}):'.format(which))
    # The "unique" figure counts distinct program texts across all users.
    print('correct: {} ({} unique)'.format(len(passing), len(set(passing))))
    print('incorrect: {} ({} unique)'.format(len(failing), len(set(failing))))
    print()
# Extract attributes from training data: count every pattern occurrence
# reported by get_patterns over all training programs.
patterns = collections.Counter()
for code, correct in data['train']:
    patterns.update(pat for pat, nodes in get_patterns(code))

# Keep patterns seen at least five times as attributes and record the
# attribute-name -> pattern mapping in attributes.tab.
attrs = []
with open(path + '/attributes.tab', 'w') as pattern_file:
    for index, (pattern, frequency) in enumerate(patterns.most_common()):
        if frequency >= 5:
            attrs.append(pattern)
            pattern_file.write('a{}\t{}\n'.format(index, pattern))
        else:
            # most_common() is sorted by decreasing count, so every
            # remaining pattern is below the threshold as well.
            break
# Check and write attributes for training/test data.
#
# For each partition a tab-separated file programs-<partition>.tab is written
# with three header lines followed by one row per program:
#   code, correct, a0, a1, ...
# The three-line header (names, 'd' type markers, 'meta'/'class' flags) looks
# like the Orange tab-delimited format -- NOTE(review): confirm with the
# consumer of these files.

# The header does not depend on the partition, so build it once.
header = ['code', 'correct'] + ['a{}'.format(i) for i in range(len(attrs))]
for t in ['train', 'test']:
    with open(path + '/programs-{}.tab'.format(t), 'w') as f:
        # print header
        print('\t'.join(header), file=f)
        print('\t'.join(['d'] * len(header)), file=f)
        print('meta\tclass', file=f)
        # print rows (program, correct, attr1, attr2, ...)
        for code, correct in data[t]:
            # A set gives constant-time membership tests for the attribute
            # loop below; patterns are hashable because they are already
            # used as Counter keys when the attributes are selected.
            code_pats = {pat for pat, nodes in get_patterns(code)}
            # repr() keeps the program on one line by escaping newlines
            # and tabs.
            fields = [repr(code), 'T' if correct else 'F']
            fields.extend('T' if pat in code_pats else 'F' for pat in attrs)
            print('\t'.join(fields), file=f)
|