1 files changed, 173 insertions, 147 deletions
diff --git a/test-rules.py b/test-rules.py
index bb89e7e..8762cdd 100755
--- a/test-rules.py
+++ b/test-rules.py
@@ -13,16 +13,42 @@ from monkey.action import parse as parse_trace
 from monkey.patterns import get_patterns
 from prolog.util import parse as prolog_parse, rename_vars_list, stringify, tokenize
 
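+# klass: rule class, 'T' (true rule) or 'F' (buggy rule)
+# condition: tuple of patterns (attribute values) named in the rule's IF part
+# distribution: rule class distribution (pair of counts read from the rules file)
+# quality: rule quality (float read from the rules file)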
+class Rule(collections.namedtuple('Rule', ['klass', 'condition', 'distribution', 'quality'])):
+    def __str__(self):
+        s = 'Rule: class = {}, distribution = {}, quality = {}\n'.format(self.klass, self.distribution, self.quality)
+        s += ''.join([str(pattern) + '\n' for pattern in self.condition])
+        return s
+
+# program: submitted code
+# correct: does this submission pass all tests?
+# patterns: patterns in this submission
+# hint: suggested hint
+class Submission(collections.namedtuple('Submission', ['program', 'correct', 'patterns', 'hint'])):
+    pass
+
+# ok: required patterns already in program (unused)
+# remove: patterns that should be removed (condition of the matching buggy rule)
+# add: patterns that should be added (at most one: the most frequently missing pattern)
+# add_alternatives: other patterns missing from the relevant rules (most frequent first)
+# NOTE currently either only remove is set (buggy rule) or only add / add_alternatives are set (true rule)
+class Hint(collections.namedtuple('Hint', ['ok', 'remove', 'add', 'add_alternatives'])):
+    pass
+
 # script arguments
 solutions_file = sys.argv[1]
-pid = int(sys.argv[2])
-data_dir = sys.argv[3]
+data_dir = sys.argv[2]
 
-attributes_file = os.path.join(data_dir, 'attributes')
-rules_file = os.path.join(data_dir, 'rules')
-users_file = os.path.join(data_dir, 'users-test')
+pid_file = os.path.join(data_dir, 'pid')
+attributes_file = os.path.join(data_dir, 'attributes.tab')
+rules_file = os.path.join(data_dir, 'rules.txt')
+users_file = os.path.join(data_dir, 'users-test.txt')
 programs_file = os.path.join(data_dir, 'programs.pickle')
 
+pid = int(open(pid_file, 'r').read().strip())
 # read test results for known programs
 test = pickle.load(open(programs_file, 'rb'))
 
@@ -35,20 +61,15 @@ for solution in pickle.load(open(solutions_file, 'rb')):
 
 # read attributes
 attributes = dict([line.strip().split('\t') for line in open(attributes_file, 'r').readlines()])
-
-class Rule(collections.namedtuple('Rule', ['klass', 'condition', 'distribution', 'quality'])):
-    def __str__(self):
-        s = 'Rule: class = {}, distribution = {}, quality = {}\n'.format(self.klass, self.distribution, self.quality)
-        s += ''.join([str(pattern) + '\n' for pattern, yes in self.condition])
-        return s
+attributes_ordered = [line.strip().split('\t')[1] for line in open(attributes_file, 'r').readlines()]
 
 # read rules
 rules = []
 for line in open(rules_file, 'r').readlines():
-    match = re.match(r'IF ((?:a[0-9]*[^ ]*(?: AND )*)*) THEN correct=([TF]) *\[ *([0-9]*) *([0-9]*)\] *([0-9.]*)', line.strip())
+    match = re.match(r'IF ((?:a[0-9]*!=F(?: AND )*)*) THEN correct=([TF]) *\[ *([0-9]*) *([0-9]*)\] *([0-9.]*)', line.strip())
     if match:
         m = tuple(match.groups())
-        condition = tuple((attributes[field[:-3]], field.endswith('!=F')) for field in m[0].split(' AND '))
+        condition = tuple(attributes[field[:-3]] for field in m[0].split(' AND '))
         rules.append(Rule(m[-4], condition, (int(m[-3]), int(m[-2])), float(m[-1])))
         #print(rules[-1])
     else:
@@ -65,161 +86,166 @@ def color_print(text, ranges):
         i = start + length
     print(text[i:])
 
-# check if given patterns match the rule
-def check_rule(rule, patterns):
-    ret_patterns = []
-    for rule_pattern, yes in rule.condition:
-        if yes:
-            # this pattern must be present
-            for pattern, nodes in patterns:
-                if pattern == rule_pattern:
-                    ret_patterns.append((rule_pattern, nodes))
-        else:
-            # this pattern must not be present
-            if rule_pattern in [p[0] for p in patterns]:
-                return []
-    return ret_patterns
+# generate (position, length, color) marks for color_print from nodes of the selected patterns
+def mark(patterns, selected, color):
+    marks = set()
+    for pattern, nodes in patterns:
+        if pattern in selected:
+            marks |= set((n[0].pos, len(n[0].val), color) for n in nodes if n[0].pos)
+    return marks
+
+# return a hint for the first applicable buggy rule (rules are checked in the order given)
+def suggest_buggy(rules, patterns):
+    for rule in [r for r in rules if r.klass == 'F']:
+        # suggest this rule if all patterns in condition are found in the program
+        if all(rule_pattern in [p[0] for p in patterns] for rule_pattern in rule.condition):
+            return Hint(ok=[], remove=rule.condition, add=[], add_alternatives=[])
+    return None
+
+# return a hint built from the partially matched true rules that come closest to matching the program
+def suggest_true(rules, patterns):
+    # get match info for all true rules
+    rule_matches = collections.defaultdict(list)
+    for rule in [r for r in rules if r.klass == 'T']:
+        found = set()
+        missing = set()
+        for rule_pattern in rule.condition:
+            if any(pattern == rule_pattern for pattern, nodes in patterns):
+                found.add(rule_pattern)
+            else:
+                missing.add(rule_pattern)
+        if missing:
+            rule_matches[len(found)].append((found, missing))
+
+    # build a hint from the rules with the most matching patterns (only 10 down to 1 matches are considered)
+    for i in range(10, 0, -1):
+        if i not in rule_matches:
+            continue
+        missing_patterns = collections.Counter()
+        for found, missing in rule_matches[i]:
+            for pattern in missing:
+                missing_patterns[pattern] += 1
+
+        best_missing_patterns = []
+        for missing_pattern, count in missing_patterns.most_common():
+            if count == missing_patterns.most_common()[0][1]:
+                best_missing_patterns.append(missing_pattern)
+            else:
+                break
 
-# keep track of when each suggestion was applied
-all_suggestions = []
-# programs with no matching rule
-unsuggestable = collections.Counter()
+        add = []
+        for pattern in attributes_ordered:
+            if pattern in best_missing_patterns:
+                add = [pattern]
+                break
+        add_alternatives = [pattern for pattern, _ in missing_patterns.most_common() if pattern not in add]
+        return Hint(ok=[], remove=[], add=add, add_alternatives=add_alternatives)
+
+    return None
 
+# evaluate hints on student traces
+submissions = collections.defaultdict(list)
 for user, trace in traces.items():
-    # get submissions from trace
-    programs = []
+    # get submissions for this user
+    user_submissions = []
     code = ''
     for action in parse_trace(trace):
         code = action.apply(code)
         if action.type == 'test':
+            # skip syntactically incorrect submissions
             if prolog_parse(code) is None:
                 continue
+
             normalized_code = stringify(rename_vars_list(tokenize(code)))
-            if programs and normalized_code == programs[-1][0]:
+            # skip repeated submissions
+            if user_submissions and normalized_code == user_submissions[-1].program:
+                continue
+            # skip submissions without cached test results
+            if normalized_code not in test:
                 continue
             correct = test[normalized_code]['n_tests'] == test[normalized_code]['n_passed']
-            programs.append((normalized_code, correct))
-            # ignore actions after first correct submission
+
+            # check rules for this submission
+            program_patterns = list(get_patterns(normalized_code))
+            hint = suggest_buggy(rules, program_patterns)
+            if not hint:
+                hint = suggest_true(rules, program_patterns)
+            user_submissions.append(Submission(normalized_code, correct, program_patterns, hint))
+
+            # skip submissions after the first correct program
             if correct:
                 break
 
     # ignore traces with no / only correct submissions
-    if not any(p[1] for p in programs) or all(p[1] for p in programs):
+    if (not any(s.correct for s in user_submissions) or
+        all(s.correct for s in user_submissions)):
         continue
 
-    suggested = []
-    for i, (program, correct) in enumerate(programs):
-        program_patterns = list(get_patterns(program))
-        #for p in program_patterns:
-        #    print(p[0])
-        #print()
-
-        # check if previously suggested rules match
-        for s in suggested:
-            s['passed'] += 1
-            match = check_rule(s['rule'], program_patterns)
-            if (s['rule'].klass == 'T' and len(match) == len(s['rule'].condition) or
-                s['rule'].klass == 'F' and not match):
-                s['matched'].append(s['passed'])
-
-        # only check programs until first correct submission
-        if correct:
-            print(str(i) + ' PASS\t' + program)
-            print()
-            break
-
-        # check rules in order, buggy rules first
-        found = False
-        for rule in (
-                [r for r in rules if r.klass == 'F'] +
-                [r for r in rules if r.klass == 'T']):
-            match = check_rule(rule, program_patterns)
-            if (rule.klass == 'F' and not match or
-                rule.klass == 'T' and len(match) != len(rule.condition)-1):
-                continue
-            found = True
-
-            # store suggestion to see if it was implemented later
-            if not any(s['program'] == program and s['rule'] == rule for s in suggested):
-                # passed: how many submission before PASS
-                # matched: list of submissions where the suggested rule matched
-                #          (the current submission has index 0, the next 1 and so on)
-                suggested.append({'program': program, 'rule': rule, 'found': i, 'passed': 0, 'matched': []})
-
-            # get highlights
-            highlight = set()
-            for m in match:
-                for n in m[1]:
-                    highlight.add((n[0].pos, len(n[0].val), ('green' if rule.klass == 'T' else 'red')))
-
-            # print highighted program
-            print(str(i) + ' FAIL', end='\t')
-            color_print(program, list(highlight))
-
-            # print rule
-            for rule_pattern, yes in rule.condition:
-                if rule.klass == 'T':
-                    if rule_pattern in [pattern for pattern, nodes in program_patterns]:
-                        print('good\t' + str(rule_pattern))
-                    else:
-                        print('missing\t' + str(rule_pattern))
-                else:
-                    if rule_pattern in [pattern for pattern, nodes in program_patterns]:
-                        print('buggy\t' + str(rule_pattern))
-            print()
-            break
-
-        if not found:
-            print(str(i) + ' FAIL\t' + str(program))
-            print()
-            unsuggestable[program] += 1
-
-    print('Suggestions and versions in which they were implemented:')
-    for s in suggested:
-        index = len(programs) - (s['passed'] + 1)
-        print(index, [index + m for m in s['matched']])
-    all_suggestions += suggested
-
+    submissions[user] = user_submissions
+
+    # print submissions with hints for debugging
+    for s in user_submissions:
+        print('PASS' if s.correct else 'FAIL', end='\t')
+        marks = []
+        if s.hint and s.hint.remove:
+            marks = mark(s.patterns, s.hint.remove, 'red')
+        color_print(s.program, marks)
+
+        if s.hint:
+            for x in s.hint.remove:
+                print('buggy\t', x)
+            for x in s.hint.add:
+                print('missing\t', x)
+            for x in s.hint.add_alternatives:
+                print('alternative\t', x)
+        print()
     print('-'*30)
     print()
 
-# report
-not_matched = [s for s in all_suggestions if s['passed'] not in s['matched']]
-matched = [s for s in all_suggestions if s['passed'] in s['matched']]
-
-# rules that did / did not match in the solution
-good = collections.Counter()
-bad = collections.Counter()
-for s in all_suggestions:
-    (good if s in matched else bad)[s['rule']] += 1
+# submissions where the hint was followed in the solution (suggested pattern added or a buggy pattern removed)
+good_hint = []
+# submissions where one of the alternative hint patterns was implemented in the solution
+medium_hint = []
+# submissions where none of the hint patterns were implemented in the solution
+bad_hint = []
+# submissions with no suggestions
+no_hint = []
+
+# total number of evaluated submissions (the final, correct submission of each trace is excluded)
+n_subs = 0
+for user, subs in submissions.items():
+    solution = subs[-1]
+    solution_patterns = [p[0] for p in solution.patterns]
+    for s in subs[:-1]:
+        n_subs += 1
+        if s.hint:
+            if s.hint.remove:
+                # buggy rule: at least one pattern should not be present in solution
+                if any(pattern not in solution_patterns for pattern in s.hint.remove):
+                    good_hint.append(s)
+                else:
+                    bad_hint.append(s)
+            else:
+                # true rule: all patterns should be present in solution
+                if all(pattern in solution_patterns for pattern in s.hint.add):
+                    # best suggested pattern(s) match
+                    good_hint.append(s)
+                elif any(pattern in solution_patterns for pattern in s.hint.add_alternatives):
+                    # some suggested pattern(s) match
+                    medium_hint.append(s)
+                else:
+                    bad_hint.append(s)
+        else:
+            no_hint.append(s)
 
 print('Statistics')
 print('----------')
-print('# of suggestions that were implemented:', len(matched))
-print('# of suggestions that were not implemented:', len(not_matched))
-print('avg. # of submissions before suggestion was implemented:',
-        sum(s['matched'][0] for s in matched)/len(matched))
-print('avg. # of submissions until PASS after suggestion was implemented:',
-        sum(s['passed'] - s['matched'][0] for s in matched)/len(matched))
-print('avg. # of submissions until PASS if suggestion was not implemented:',
-        sum(s['passed'] for s in not_matched)/len(not_matched))
-#print('avg. % of submissions after suggestion where it was not implemented :', 1-mean(len(s['matched'])/s['passed'] for s in matched))
-print()
-
-print('Unsuggestable programs')
-print('----------------------')
-for p, count in unsuggestable.most_common():
-    print('{}\t{}'.format(count, p))
-print()
-
-print('Good rules')
-print('----------')
-for r, count in good.most_common():
-    print('Suggested for ' + str(count) + ' submissions')
-    print(r)
-
-print('Bad rules')
-print('---------')
-for r, count in bad.most_common():
-    print('Suggested for ' + str(count) + ' submissions')
-    print(r)
+print('avg. submissions per trace:', mean(len(subs) for subs in submissions.values()))
+print('avg. clauses in solution:', mean(subs[-1].program.count('.') for subs in submissions.values()))
+print('total submissions:', n_subs)
+print('positive hints (best implemented):', len([s for s in good_hint if s.hint.add]))
+print('positive hints (alternative implemented):', len([s for s in medium_hint if s.hint.add_alternatives]))
+print('positive hints (not implemented):', len([s for s in bad_hint if s.hint.add]))
+print('buggy hints (implemented):', len([s for s in good_hint if s.hint.remove]))
+print('buggy hints (not implemented):', len([s for s in bad_hint if s.hint.remove]))
+print('no hints:', len(no_hint))