Diffstat (limited to 'test-rules.py')
-rwxr-xr-x  test-rules.py  225
1 file changed, 225 insertions, 0 deletions
diff --git a/test-rules.py b/test-rules.py
new file mode 100755
index 0000000..bb89e7e
--- /dev/null
+++ b/test-rules.py
@@ -0,0 +1,225 @@
+#!/usr/bin/python3
+
+import collections
+import os.path
+import pickle
+import re
+from statistics import mean
+import sys
+
+from termcolor import colored
+
+from monkey.action import parse as parse_trace
+from monkey.patterns import get_patterns
+from prolog.util import parse as prolog_parse, rename_vars_list, stringify, tokenize
+
+# script arguments
+solutions_file = sys.argv[1]
+pid = int(sys.argv[2])
+data_dir = sys.argv[3]
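+# usage (assumed invocation, inferred from the arguments above):
+#   ./test-rules.py <solutions.pickle> <problem-id> <data-dir>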
+
+attributes_file = os.path.join(data_dir, 'attributes')
+rules_file = os.path.join(data_dir, 'rules')
+users_file = os.path.join(data_dir, 'users-test')
+programs_file = os.path.join(data_dir, 'programs.pickle')
+
+# read test results for known programs
+test = pickle.load(open(programs_file, 'rb'))
+
+# read traces
+users = [int(line.strip()) for line in open(users_file, 'r').readlines()]
+traces = {}
+for solution in pickle.load(open(solutions_file, 'rb')):
+ if solution.problem_id == pid and solution.codeq_user_id in users:
+ traces[solution.codeq_user_id] = solution.trace
+
+# read attributes
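+# each line is expected to hold an attribute name and its pattern, separated
+# by a tab, e.g. "a5\t<pattern>"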
+attributes = dict([line.strip().split('\t') for line in open(attributes_file, 'r').readlines()])
+
+class Rule(collections.namedtuple('Rule', ['klass', 'condition', 'distribution', 'quality'])):
+ def __str__(self):
+ s = 'Rule: class = {}, distribution = {}, quality = {}\n'.format(self.klass, self.distribution, self.quality)
+ s += ''.join([str(pattern) + '\n' for pattern, yes in self.condition])
+ return s
+
+# read rules
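+# each rule line is expected to match the regex below, e.g.:
+#   IF a5!=F AND a12!=T THEN correct=F [ 40 2] 0.95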
+rules = []
+for line in open(rules_file, 'r').readlines():
+ match = re.match(r'IF ((?:a[0-9]*[^ ]*(?: AND )*)*) THEN correct=([TF]) *\[ *([0-9]*) *([0-9]*)\] *([0-9.]*)', line.strip())
+ if match:
+ m = tuple(match.groups())
+ condition = tuple((attributes[field[:-3]], field.endswith('!=F')) for field in m[0].split(' AND '))
+ rules.append(Rule(m[-4], condition, (int(m[-3]), int(m[-2])), float(m[-1])))
+ #print(rules[-1])
+ else:
+ print('Did not understand rule:', line.strip())
+
+def color_print(text, ranges):
+ i = 0
+ for start, length, color in sorted(ranges):
+ # ignore overlapping ranges
+ if start < i:
+ continue
+ print(text[i:start], end='')
+ print(colored(text[start:start+length], color), end='')
+ i = start + length
+ print(text[i:])
+
+# check if given patterns match the rule
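+# returns the (pattern, nodes) pairs matched by the rule's positive
+# conditions, or [] as soon as a negative condition is violated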
+def check_rule(rule, patterns):
+ ret_patterns = []
+ for rule_pattern, yes in rule.condition:
+ if yes:
+ # this pattern must be present
+ for pattern, nodes in patterns:
+ if pattern == rule_pattern:
+ ret_patterns.append((rule_pattern, nodes))
+ else:
+ # this pattern must not be present
+ if rule_pattern in [p[0] for p in patterns]:
+ return []
+ return ret_patterns
+
+# keep track of when each suggestion was applied
+all_suggestions = []
+# programs with no matching rule
+unsuggestable = collections.Counter()
+
+for user, trace in traces.items():
+ # get submissions from trace
+ programs = []
+ code = ''
+ for action in parse_trace(trace):
+ code = action.apply(code)
+ if action.type == 'test':
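+ # skip programs that do not parse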
+ if prolog_parse(code) is None:
+ continue
+ normalized_code = stringify(rename_vars_list(tokenize(code)))
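+ # skip consecutive resubmissions of the same (normalized) program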
+ if programs and normalized_code == programs[-1][0]:
+ continue
+ correct = test[normalized_code]['n_tests'] == test[normalized_code]['n_passed']
+ programs.append((normalized_code, correct))
+ # ignore actions after first correct submission
+ if correct:
+ break
+
+ # ignore traces with no / only correct submissions
+ if not any(p[1] for p in programs) or all(p[1] for p in programs):
+ continue
+
+ suggested = []
+ for i, (program, correct) in enumerate(programs):
+ program_patterns = list(get_patterns(program))
+ #for p in program_patterns:
+ # print(p[0])
+ #print()
+
+ # check if previously suggested rules match
+ for s in suggested:
+ s['passed'] += 1
+ match = check_rule(s['rule'], program_patterns)
+ if ((s['rule'].klass == 'T' and len(match) == len(s['rule'].condition)) or
+ (s['rule'].klass == 'F' and not match)):
+ s['matched'].append(s['passed'])
+
+ # only check programs until first correct submission
+ if correct:
+ print(str(i) + ' PASS\t' + program)
+ print()
+ break
+
+ # check rules in order, buggy rules first
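+ # a buggy (F) rule is suggested when at least one of its patterns is
+ # present; a correct (T) rule when all but one of its conditions match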
+ found = False
+ for rule in (
+ [r for r in rules if r.klass == 'F'] +
+ [r for r in rules if r.klass == 'T']):
+ match = check_rule(rule, program_patterns)
+ if ((rule.klass == 'F' and not match) or
+ (rule.klass == 'T' and len(match) != len(rule.condition)-1)):
+ continue
+ found = True
+
+ # store suggestion to see if it was implemented later
+ if not any(s['program'] == program and s['rule'] == rule for s in suggested):
+ # passed: how many submissions before PASS
+ # matched: list of submissions where the suggested rule matched
+ # (the current submission has index 0, the next 1 and so on)
+ suggested.append({'program': program, 'rule': rule, 'found': i, 'passed': 0, 'matched': []})
+
+ # get highlights
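+ # each matched node is highlighted from its first token's position for
+ # the length of that token's value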
+ highlight = set()
+ for m in match:
+ for n in m[1]:
+ highlight.add((n[0].pos, len(n[0].val), ('green' if rule.klass == 'T' else 'red')))
+
+ # print highlighted program
+ print(str(i) + ' FAIL', end='\t')
+ color_print(program, list(highlight))
+
+ # print rule
+ for rule_pattern, yes in rule.condition:
+ if rule.klass == 'T':
+ if rule_pattern in [pattern for pattern, nodes in program_patterns]:
+ print('good\t' + str(rule_pattern))
+ else:
+ print('missing\t' + str(rule_pattern))
+ else:
+ if rule_pattern in [pattern for pattern, nodes in program_patterns]:
+ print('buggy\t' + str(rule_pattern))
+ print()
+ break
+
+ if not found:
+ print(str(i) + ' FAIL\t' + str(program))
+ print()
+ unsuggestable[program] += 1
+
+ print('Suggestions and versions in which they were implemented:')
+ for s in suggested:
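+ # recover the index of the submission where the rule was suggested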
+ index = len(programs) - (s['passed'] + 1)
+ print(index, [index + m for m in s['matched']])
+ all_suggestions += suggested
+
+ print('-'*30)
+ print()
+
+# report
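+# a suggestion counts as implemented if its rule was satisfied in the
+# final (passing) submission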
+not_matched = [s for s in all_suggestions if s['passed'] not in s['matched']]
+matched = [s for s in all_suggestions if s['passed'] in s['matched']]
+
+# rules that did / did not match in the solution
+good = collections.Counter()
+bad = collections.Counter()
+for s in all_suggestions:
+ (good if s in matched else bad)[s['rule']] += 1
+
+print('Statistics')
+print('----------')
+print('# of suggestions that were implemented:', len(matched))
+print('# of suggestions that were not implemented:', len(not_matched))
+print('avg. # of submissions before suggestion was implemented:',
+ sum(s['matched'][0] for s in matched)/len(matched))
+print('avg. # of submissions until PASS after suggestion was implemented:',
+ sum(s['passed'] - s['matched'][0] for s in matched)/len(matched))
+print('avg. # of submissions until PASS if suggestion was not implemented:',
+ sum(s['passed'] for s in not_matched)/len(not_matched))
+#print('avg. % of submissions after suggestion where it was not implemented:', 1-mean(len(s['matched'])/s['passed'] for s in matched))
+print()
+
+print('Unsuggestable programs')
+print('----------------------')
+for p, count in unsuggestable.most_common():
+ print('{}\t{}'.format(count, p))
+print()
+
+print('Good rules')
+print('----------')
+for r, count in good.most_common():
+ print('Suggested for ' + str(count) + ' submissions')
+ print(r)
+
+print('Bad rules')
+print('---------')
+for r, count in bad.most_common():
+ print('Suggested for ' + str(count) + ' submissions')
+ print(r)