test-rules.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225

#!/usr/bin/python3

import collections
import os.path
import pickle
import re
from statistics import mean
import sys

from termcolor import colored

from monkey.action import parse as parse_trace
from monkey.patterns import get_patterns
from prolog.util import parse as prolog_parse, rename_vars_list, stringify, tokenize

# script arguments:
#   1. pickled list of solutions (each with problem_id, codeq_user_id, trace)
#   2. numeric ID of the problem to evaluate
#   3. directory holding the attributes, rules, users-test and programs.pickle files
solutions_file = sys.argv[1]
pid = int(sys.argv[2])
data_dir = sys.argv[3]

attributes_file = os.path.join(data_dir, 'attributes')
rules_file = os.path.join(data_dir, 'rules')
users_file = os.path.join(data_dir, 'users-test')
programs_file = os.path.join(data_dir, 'programs.pickle')

# NOTE(security): pickle.load can run arbitrary code embedded in the file, so
# both pickle files below must come from a trusted source.

# read test results for known programs
with open(programs_file, 'rb') as f:
    test = pickle.load(f)

# read traces: keep only solutions for the selected problem made by test users
# (one user ID per line in the users file)
with open(users_file, 'r') as f:
    users = [int(line.strip()) for line in f]
traces = {}
with open(solutions_file, 'rb') as f:
    for solution in pickle.load(f):
        if solution.problem_id == pid and solution.codeq_user_id in users:
            traces[solution.codeq_user_id] = solution.trace

# read attributes: each line holds two tab-separated fields, key and value
# (the rule parser looks up the attribute names used in rules in this dict)
with open(attributes_file, 'r') as f:
    attributes = dict(line.strip().split('\t') for line in f)

class Rule(collections.namedtuple('Rule', ['klass', 'condition', 'distribution', 'quality'])):
    """A single classification rule.

    Fields:
        klass: class label of the rule.
        condition: sequence of (pattern, flag) pairs.
        distribution: distribution stored with the rule.
        quality: quality score stored with the rule.
    """

    def __str__(self):
        """Return a header line followed by one line per condition pattern."""
        header = 'Rule: class = {}, distribution = {}, quality = {}\n'.format(
            self.klass, self.distribution, self.quality)
        # only the pattern of each condition is shown, not its flag
        pattern_lines = [str(pattern) + '\n' for pattern, _ in self.condition]
        return header + ''.join(pattern_lines)

# read rules
# Each understood line has the form
#   IF <cond> AND <cond> ... THEN correct=<T|F> [ <int> <int>] <float>
# where every <cond> is an attribute name followed by a three-character
# suffix; the suffix '!=F' marks the condition flag as True.
rule_re = re.compile(r'IF ((?:a[0-9]*[^ ]*(?: AND )*)*) THEN correct=([TF]) *\[ *([0-9]*) *([0-9]*)\] *([0-9.]*)')
rules = []
with open(rules_file, 'r') as f:
    for line in f:
        stripped = line.strip()
        match = rule_re.match(stripped)
        if match:
            conditions_text, klass, count_a, count_b, quality = match.groups()
            # translate attribute names to patterns; drop the 3-char suffix
            condition = tuple((attributes[field[:-3]], field.endswith('!=F'))
                              for field in conditions_text.split(' AND '))
            rules.append(Rule(klass, condition, (int(count_a), int(count_b)), float(quality)))
            #print(rules[-1])
        else:
            print('Did not understand rule:', stripped)

def color_print(text, ranges):
    """Print text on one line with the given character ranges colored.

    text: the string to print.
    ranges: iterable of (start, length, color) tuples; they are processed in
        sorted order and a range that starts before the end of the previously
        printed part is skipped.
    """
    printed_upto = 0
    for begin, span, color in sorted(ranges):
        if begin >= printed_upto:
            stop = begin + span
            # plain text up to the range, then the colored range itself
            print(text[printed_upto:begin], end='')
            print(colored(text[begin:stop], color), end='')
            printed_upto = stop
        # otherwise: overlapping range, ignored
    print(text[printed_upto:])

# check if given patterns match the rule
def check_rule(rule, patterns):
    """Collect the entries of `patterns` that satisfy the rule's conditions.

    rule: object whose `condition` attribute is a sequence of
        (pattern, must_be_present) pairs.
    patterns: list of (pattern, nodes) pairs.

    Returns a list holding one (pattern, nodes) entry for every item of
    `patterns` equal to a pattern that must be present. Returns an empty list
    as soon as a pattern that must not be present is found in `patterns`.
    """
    hits = []
    for wanted, must_be_present in rule.condition:
        if not must_be_present:
            # this pattern must not be present
            if wanted in [entry[0] for entry in patterns]:
                return []
            continue
        # this pattern must be present; record every occurrence of it
        hits.extend((wanted, nodes) for found, nodes in patterns if found == wanted)
    return hits

# Main evaluation loop: for every user trace, rebuild the sequence of tested
# programs, print a suggestion (the first applicable rule) for each failing
# program and record whether that rule matched in later submissions.

# keep track of when each suggestion was applied
all_suggestions = []
# programs with no matching rule
unsuggestable = collections.Counter()

for user, trace in traces.items():
    # get submissions from trace
    # programs: list of (normalized code, passed all tests?) in submission order
    programs = []
    code = ''
    for action in parse_trace(trace):
        # each action transforms the code accumulated so far
        code = action.apply(code)
        if action.type == 'test':
            # skip code the parser returns None for (presumably a syntax
            # error -- TODO confirm against prolog.util.parse)
            if prolog_parse(code) is None:
                continue
            normalized_code = stringify(rename_vars_list(tokenize(code)))
            # skip a submission identical to the previous recorded one
            if programs and normalized_code == programs[-1][0]:
                continue
            # a program is correct when it passed all of its tests; this lookup
            # fails if the program has no entry in the stored test results
            correct = test[normalized_code]['n_tests'] == test[normalized_code]['n_passed']
            programs.append((normalized_code, correct))
            # ignore actions after first correct submission
            if correct:
                break

    # ignore traces with no / only correct submissions
    if not any(p[1] for p in programs) or all(p[1] for p in programs):
        continue

    # suggestions made so far for this trace (one dict per program/rule pair)
    suggested = []
    for i, (program, correct) in enumerate(programs):
        program_patterns = list(get_patterns(program))
        #for p in program_patterns:
        #    print(p[0])
        #print()

        # check if previously suggested rules match
        # (this runs for the final correct program too, before the break below)
        for s in suggested:
            s['passed'] += 1
            match = check_rule(s['rule'], program_patterns)
            # a 'T' rule counts as implemented when check_rule returns as many
            # entries as the rule has conditions; an 'F' rule when it returns none
            if (s['rule'].klass == 'T' and len(match) == len(s['rule'].condition) or
                s['rule'].klass == 'F' and not match):
                s['matched'].append(s['passed'])

        # only check programs until first correct submission
        if correct:
            print(str(i) + ' PASS\t' + program)
            print()
            break

        # check rules in order, buggy rules first
        found = False
        for rule in (
                [r for r in rules if r.klass == 'F'] +
                [r for r in rules if r.klass == 'T']):
            match = check_rule(rule, program_patterns)
            # an 'F' rule applies when check_rule returns at least one entry;
            # a 'T' rule applies when it returns exactly one entry fewer than
            # the rule has conditions
            if (rule.klass == 'F' and not match or
                rule.klass == 'T' and len(match) != len(rule.condition)-1):
                continue
            found = True

            # store suggestion to see if it was implemented later
            if not any(s['program'] == program and s['rule'] == rule for s in suggested):
                # passed: how many submission before PASS
                # matched: list of submissions where the suggested rule matched
                #          (the current submission has index 0, the next 1 and so on)
                suggested.append({'program': program, 'rule': rule, 'found': i, 'passed': 0, 'matched': []})

            # get highlights
            # one (position, length, color) range per node of every matched
            # pattern: green for 'T' rules, red otherwise
            highlight = set()
            for m in match:
                for n in m[1]:
                    highlight.add((n[0].pos, len(n[0].val), ('green' if rule.klass == 'T' else 'red')))

            # print highlighted program
            print(str(i) + ' FAIL', end='\t')
            color_print(program, list(highlight))

            # print rule
            for rule_pattern, yes in rule.condition:
                if rule.klass == 'T':
                    if rule_pattern in [pattern for pattern, nodes in program_patterns]:
                        print('good\t' + str(rule_pattern))
                    else:
                        print('missing\t' + str(rule_pattern))
                else:
                    if rule_pattern in [pattern for pattern, nodes in program_patterns]:
                        print('buggy\t' + str(rule_pattern))
            print()
            # only the first applicable rule is reported for a program
            break

        if not found:
            print(str(i) + ' FAIL\t' + str(program))
            print()
            unsuggestable[program] += 1

    print('Suggestions and versions in which they were implemented:')
    for s in suggested:
        # index of the submission for which the suggestion was made, followed
        # by the indices of the submissions in which its rule matched
        index = len(programs) - (s['passed'] + 1)
        print(index, [index + m for m in s['matched']])
    all_suggestions += suggested

    print('-'*30)
    print()

# report
# A suggestion counts as implemented when its rule matched in the last
# submission processed for its trace, i.e. when the offset s['passed'] is
# among the offsets recorded in s['matched'].
matched = []
not_matched = []
for s in all_suggestions:
    (matched if s['passed'] in s['matched'] else not_matched).append(s)

# rules that did / did not match in the solution
good = collections.Counter(s['rule'] for s in matched)
bad = collections.Counter(s['rule'] for s in not_matched)


def average(values):
    """Return the arithmetic mean of values, or 'n/a' when there are none.

    Returning a placeholder instead of dividing by zero keeps the rest of the
    report printable when one of the suggestion groups is empty.
    """
    values = list(values)
    if not values:
        return 'n/a'
    return sum(values) / len(values)


print('Statistics')
print('----------')
print('# of suggestions that were implemented:', len(matched))
print('# of suggestions that were not implemented:', len(not_matched))
print('avg. # of submissions before suggestion was implemented:',
        average(s['matched'][0] for s in matched))
print('avg. # of submissions until PASS after suggestion was implemented:',
        average(s['passed'] - s['matched'][0] for s in matched))
print('avg. # of submissions until PASS if suggestion was not implemented:',
        average(s['passed'] for s in not_matched))
#print('avg. % of submissions after suggestion where it was not implemented :', 1-mean(len(s['matched'])/s['passed'] for s in matched))
print()

print('Unsuggestable programs')
print('----------------------')
for p, count in unsuggestable.most_common():
    print('{}\t{}'.format(count, p))
print()

print('Good rules')
print('----------')
for r, count in good.most_common():
    print('Suggested for ' + str(count) + ' submissions')
    print(r)

print('Bad rules')
print('---------')
for r, count in bad.most_common():
    print('Suggested for ' + str(count) + ' submissions')
    print(r)