Diffstat (limited to 'test-rules.py')
-rwxr-xr-x  test-rules.py  225
1 file changed, 225 insertions, 0 deletions
diff --git a/test-rules.py b/test-rules.py
new file mode 100755
index 0000000..bb89e7e
--- /dev/null
+++ b/test-rules.py
@@ -0,0 +1,225 @@
+#!/usr/bin/python3
+
+import collections
+import os.path
+import pickle
+import re
+from statistics import mean
+import sys
+
+from termcolor import colored
+
+from monkey.action import parse as parse_trace
+from monkey.patterns import get_patterns
+from prolog.util import parse as prolog_parse, rename_vars_list, stringify, tokenize
+
+# script arguments
+solutions_file = sys.argv[1]
+pid = int(sys.argv[2])
+data_dir = sys.argv[3]
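+# usage (assumed invocation, inferred from the arguments above):
+#   ./test-rules.py <solutions.pickle> <problem-id> <data-dir>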
+
+attributes_file = os.path.join(data_dir, 'attributes')
+rules_file = os.path.join(data_dir, 'rules')
+users_file = os.path.join(data_dir, 'users-test')
+programs_file = os.path.join(data_dir, 'programs.pickle')
+
+# read test results for known programs
+test = pickle.load(open(programs_file, 'rb'))
+
+# read traces
+users = [int(line.strip()) for line in open(users_file, 'r').readlines()]
+traces = {}
+for solution in pickle.load(open(solutions_file, 'rb')):
+ if solution.problem_id == pid and solution.codeq_user_id in users:
+ traces[solution.codeq_user_id] = solution.trace
+
+# read attributes
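+# each line is expected to hold an attribute name and its pattern, separated
+# by a tab, e.g. "a5\t<pattern>"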
+attributes = dict([line.strip().split('\t') for line in open(attributes_file, 'r').readlines()])
+
+class Rule(collections.namedtuple('Rule', ['klass', 'condition', 'distribution', 'quality'])):
+ def __str__(self):
+ s = 'Rule: class = {}, distribution = {}, quality = {}\n'.format(self.klass, self.distribution, self.quality)
+ s += ''.join([str(pattern) + '\n' for pattern, yes in self.condition])
+ return s
+
+# read rules
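+# each rule line is expected to match the regex below, e.g.:
+#   IF a5!=F AND a12!=T THEN correct=F [ 40 2] 0.95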
+rules = []
+for line in open(rules_file, 'r').readlines():
+ match = re.match(r'IF ((?:a[0-9]*[^ ]*(?: AND )*)*) THEN correct=([TF]) *\[ *([0-9]*) *([0-9]*)\] *([0-9.]*)', line.strip())
+ if match:
+ m = tuple(match.groups())
+ condition = tuple((attributes[field[:-3]], field.endswith('!=F')) for field in m[0].split(' AND '))
+ rules.append(Rule(m[-4], condition, (int(m[-3]), int(m[-2])), float(m[-1])))
+ #print(rules[-1])
+ else:
+ print('Did not understand rule:', line.strip())
+
+def color_print(text, ranges):
+ i = 0
+ for start, length, color in sorted(ranges):
+ # ignore overlapping ranges
+ if start < i:
+ continue
+ print(text[i:start], end='')
+ print(colored(text[start:start+length], color), end='')
+ i = start + length
+ print(text[i:])
+
+# check if given patterns match the rule
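+# returns the (pattern, nodes) pairs matched by the rule's positive
+# conditions, or [] as soon as a negative condition is violated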
+def check_rule(rule, patterns):
+ ret_patterns = []
+ for rule_pattern, yes in rule.condition:
+ if yes:
+ # this pattern must be present
+ for pattern, nodes in patterns:
+ if pattern == rule_pattern:
+ ret_patterns.append((rule_pattern, nodes))
+ else:
+ # this pattern must not be present
+ if rule_pattern in [p[0] for p in patterns]:
+ return []
+ return ret_patterns
+
+# keep track of when each suggestion was applied
+all_suggestions = []
+# programs with no matching rule
+unsuggestable = collections.Counter()
+
+for user, trace in traces.items():
+ # get submissions from trace
+ programs = []
+ code = ''
+ for action in parse_trace(trace):
+ code = action.apply(code)
+ if action.type == 'test':
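+ # skip programs that do not parse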
+ if prolog_parse(code) is None:
+ continue
+ normalized_code = stringify(rename_vars_list(tokenize(code)))
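+ # skip consecutive resubmissions of the same (normalized) program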
+ if programs and normalized_code == programs[-1][0]:
+ continue
+ correct = test[normalized_code]['n_tests'] == test[normalized_code]['n_passed']
+ programs.append((normalized_code, correct))
+ # ignore actions after first correct submission
+ if correct:
+ break
+
+ # ignore traces with no / only correct submissions
+ if not any(p[1] for p in programs) or all(p[1] for p in programs):
+ continue
+
+ suggested = []
+ for i, (program, correct) in enumerate(programs):
+ program_patterns = list(get_patterns(program))
+ #for p in program_patterns:
+ # print(p[0])
+ #print()
+
+ # check if previously suggested rules match
+ for s in suggested:
+ s['passed'] += 1
+ match = check_rule(s['rule'], program_patterns)
+ if ((s['rule'].klass == 'T' and len(match) == len(s['rule'].condition)) or
+ (s['rule'].klass == 'F' and not match)):
+ s['matched'].append(s['passed'])
+
+ # only check programs until first correct submission
+ if correct:
+ print(str(i) + ' PASS\t' + program)
+ print()
+ break
+
+ # check rules in order, buggy rules first
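+ # a buggy (F) rule is suggested when at least one of its patterns is
+ # present; a correct (T) rule when all but one of its conditions match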
+ found = False
+ for rule in (
+ [r for r in rules if r.klass == 'F'] +
+ [r for r in rules if r.klass == 'T']):
+ match = check_rule(rule, program_patterns)
+ if ((rule.klass == 'F' and not match) or
+ (rule.klass == 'T' and len(match) != len(rule.condition)-1)):
+ continue
+ found = True
+
+ # store suggestion to see if it was implemented later
+ if not any(s['program'] == program and s['rule'] == rule for s in suggested):
+ # passed: how many submissions before PASS
+ # matched: list of submissions where the suggested rule matched
+ # (the current submission has index 0, the next 1 and so on)
+ suggested.append({'program': program, 'rule': rule, 'found': i, 'passed': 0, 'matched': []})
+
+ # get highlights
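+ # each matched node is highlighted from its first token's position for
+ # the length of that token's value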
+ highlight = set()
+ for m in match:
+ for n in m[1]:
+ highlight.add((n[0].pos, len(n[0].val), ('green' if rule.klass == 'T' else 'red')))
+
+ # print highlighted program
+ print(str(i) + ' FAIL', end='\t')
+ color_print(program, list(highlight))
+
+ # print rule
+ for rule_pattern, yes in rule.condition:
+ if rule.klass == 'T':
+ if rule_pattern in [pattern for pattern, nodes in program_patterns]:
+ print('good\t' + str(rule_pattern))
+ else:
+ print('missing\t' + str(rule_pattern))
+ else:
+ if rule_pattern in [pattern for pattern, nodes in program_patterns]:
+ print('buggy\t' + str(rule_pattern))
+ print()
+ break
+
+ if not found:
+ print(str(i) + ' FAIL\t' + str(program))
+ print()
+ unsuggestable[program] += 1
+
+ print('Suggestions and versions in which they were implemented:')
+ for s in suggested:
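+ # recover the index of the submission where the rule was suggested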
+ index = len(programs) - (s['passed'] + 1)
+ print(index, [index + m for m in s['matched']])
+ all_suggestions += suggested
+
+ print('-'*30)
+ print()
+
+# report
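+# a suggestion counts as implemented if its rule was satisfied in the
+# final (passing) submission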
+not_matched = [s for s in all_suggestions if s['passed'] not in s['matched']]
+matched = [s for s in all_suggestions if s['passed'] in s['matched']]
+
+# rules that did / did not match in the solution
+good = collections.Counter()
+bad = collections.Counter()
+for s in all_suggestions:
+ (good if s in matched else bad)[s['rule']] += 1
+
+print('Statistics')
+print('----------')
+print('# of suggestions that were implemented:', len(matched))
+print('# of suggestions that were not implemented:', len(not_matched))
+print('avg. # of submissions before suggestion was implemented:',
+ sum(s['matched'][0] for s in matched)/len(matched))
+print('avg. # of submissions until PASS after suggestion was implemented:',
+ sum(s['passed'] - s['matched'][0] for s in matched)/len(matched))
+print('avg. # of submissions until PASS if suggestion was not implemented:',
+ sum(s['passed'] for s in not_matched)/len(not_matched))
+#print('avg. % of submissions after suggestion where it was not implemented:', 1-mean(len(s['matched'])/s['passed'] for s in matched))
+print()
+
+print('Unsuggestable programs')
+print('----------------------')
+for p, count in unsuggestable.most_common():
+ print('{}\t{}'.format(count, p))
+print()
+
+print('Good rules')
+print('----------')
+for r, count in good.most_common():
+ print('Suggested for ' + str(count) + ' submissions')
+ print(r)
+
+print('Bad rules')
+print('---------')
+for r, count in bad.most_common():
+ print('Suggested for ' + str(count) + ' submissions')
+ print(r)