summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xtest-rules.py320
1 file changed, 173 insertions, 147 deletions
diff --git a/test-rules.py b/test-rules.py
index bb89e7e..8762cdd 100755
--- a/test-rules.py
+++ b/test-rules.py
@@ -13,16 +13,42 @@ from monkey.action import parse as parse_trace
from monkey.patterns import get_patterns
from prolog.util import parse as prolog_parse, rename_vars_list, stringify, tokenize
+# klass: T/F
+# condition: list of patterns
+# distribution: rule class distribution
+# quality: rule quality
+class Rule(collections.namedtuple('Rule', ['klass', 'condition', 'distribution', 'quality'])):
+ def __str__(self):
+ s = 'Rule: class = {}, distribution = {}, quality = {}\n'.format(self.klass, self.distribution, self.quality)
+ s += ''.join([str(pattern) + '\n' for pattern in self.condition])
+ return s
+
+# program: submitted code
+# correct: does this submission pass all tests?
+# patterns: patterns in this submission
+# hint: suggested hint
+class Submission(collections.namedtuple('Submission', ['program', 'correct', 'patterns', 'hint'])):
+ pass
+
+# ok: required patterns already in program (unused)
+# remove: patterns that should be removed
+# add: patterns that should be added (intersection from all relevant rules)
+# add_alternatives: patterns that should be added (union from all relevant rules)
+# NOTE currently either (only remove is set) or (both add and add_alternatives are set)
+class Hint(collections.namedtuple('Hint', ['ok', 'remove', 'add', 'add_alternatives'])):
+ pass
+
# script arguments
solutions_file = sys.argv[1]
-pid = int(sys.argv[2])
-data_dir = sys.argv[3]
+data_dir = sys.argv[2]
-attributes_file = os.path.join(data_dir, 'attributes')
-rules_file = os.path.join(data_dir, 'rules')
-users_file = os.path.join(data_dir, 'users-test')
+pid_file = os.path.join(data_dir, 'pid')
+attributes_file = os.path.join(data_dir, 'attributes.tab')
+rules_file = os.path.join(data_dir, 'rules.txt')
+users_file = os.path.join(data_dir, 'users-test.txt')
programs_file = os.path.join(data_dir, 'programs.pickle')
+pid = int(open(pid_file, 'r').read().strip())
# read test results for known programs
test = pickle.load(open(programs_file, 'rb'))
@@ -35,20 +61,15 @@ for solution in pickle.load(open(solutions_file, 'rb')):
# read attributes
attributes = dict([line.strip().split('\t') for line in open(attributes_file, 'r').readlines()])
-
-class Rule(collections.namedtuple('Rule', ['klass', 'condition', 'distribution', 'quality'])):
- def __str__(self):
- s = 'Rule: class = {}, distribution = {}, quality = {}\n'.format(self.klass, self.distribution, self.quality)
- s += ''.join([str(pattern) + '\n' for pattern, yes in self.condition])
- return s
+attributes_ordered = [line.strip().split('\t')[1] for line in open(attributes_file, 'r').readlines()]
# read rules
rules = []
for line in open(rules_file, 'r').readlines():
- match = re.match(r'IF ((?:a[0-9]*[^ ]*(?: AND )*)*) THEN correct=([TF]) *\[ *([0-9]*) *([0-9]*)\] *([0-9.]*)', line.strip())
+ match = re.match(r'IF ((?:a[0-9]*!=F(?: AND )*)*) THEN correct=([TF]) *\[ *([0-9]*) *([0-9]*)\] *([0-9.]*)', line.strip())
if match:
m = tuple(match.groups())
- condition = tuple((attributes[field[:-3]], field.endswith('!=F')) for field in m[0].split(' AND '))
+ condition = tuple(attributes[field[:-3]] for field in m[0].split(' AND '))
rules.append(Rule(m[-4], condition, (int(m[-3]), int(m[-2])), float(m[-1])))
#print(rules[-1])
else:
@@ -65,161 +86,166 @@ def color_print(text, ranges):
i = start + length
print(text[i:])
-# check if given patterns match the rule
-def check_rule(rule, patterns):
- ret_patterns = []
- for rule_pattern, yes in rule.condition:
- if yes:
- # this pattern must be present
- for pattern, nodes in patterns:
- if pattern == rule_pattern:
- ret_patterns.append((rule_pattern, nodes))
- else:
- # this pattern must not be present
- if rule_pattern in [p[0] for p in patterns]:
- return []
- return ret_patterns
+# generate marks for selected patterns for color_print
+def mark(patterns, selected, color):
+ marks = set()
+ for pattern, nodes in patterns:
+ if pattern in selected:
+ marks |= set((n[0].pos, len(n[0].val), color) for n in nodes if n[0].pos)
+ return marks
+
+# return a hint for the best applicable buggy rule
+def suggest_buggy(rules, patterns):
+ for rule in [r for r in rules if r.klass == 'F']:
+ # suggest this rule if all patterns in condition are found in the program
+ if all(rule_pattern in [p[0] for p in patterns] for rule_pattern in rule.condition):
+ return Hint(ok=[], remove=rule.condition, add=[], add_alternatives=[])
+ return None
+
+# return a hint for the best applicable true rule
+def suggest_true(rules, patterns):
+ # get match info for all true rules
+ rule_matches = collections.defaultdict(list)
+ for rule in [r for r in rules if r.klass == 'T']:
+ found = set()
+ missing = set()
+ for rule_pattern in rule.condition:
+ if any(pattern == rule_pattern for pattern, nodes in patterns):
+ found.add(rule_pattern)
+ else:
+ missing.add(rule_pattern)
+ if missing:
+ rule_matches[len(found)].append((found, missing))
+
+ # return rules with most matching patterns
+ for i in range(10, 0, -1):
+ if i not in rule_matches:
+ continue
+ missing_patterns = collections.Counter()
+ for found, missing in rule_matches[i]:
+ for pattern in missing:
+ missing_patterns[pattern] += 1
+
+ best_missing_patterns = []
+ for missing_pattern, count in missing_patterns.most_common():
+ if count == missing_patterns.most_common()[0][1]:
+ best_missing_patterns.append(missing_pattern)
+ else:
+ break
-# keep track of when each suggestion was applied
-all_suggestions = []
-# programs with no matching rule
-unsuggestable = collections.Counter()
+ add = []
+ for pattern in attributes_ordered:
+ if pattern in best_missing_patterns:
+ add = [pattern]
+ break
+ add_alternatives = [pattern for pattern, _ in missing_patterns.most_common() if pattern not in add]
+ return Hint(ok=[], remove=[], add=add, add_alternatives=add_alternatives)
+
+ return None
+# evaluate hints on student traces
+submissions = collections.defaultdict(list)
for user, trace in traces.items():
- # get submissions from trace
- programs = []
+ # get submissions for this user
+ user_submissions = []
code = ''
for action in parse_trace(trace):
code = action.apply(code)
if action.type == 'test':
+ # skip syntactically incorrect submissions
if prolog_parse(code) is None:
continue
+
normalized_code = stringify(rename_vars_list(tokenize(code)))
- if programs and normalized_code == programs[-1][0]:
+ # skip repeated submissions
+ if user_submissions and normalized_code == user_submissions[-1].program:
+ continue
+ # skip submissions without cached test results
+ if normalized_code not in test:
continue
correct = test[normalized_code]['n_tests'] == test[normalized_code]['n_passed']
- programs.append((normalized_code, correct))
- # ignore actions after first correct submission
+
+ # check rules for this submission
+ program_patterns = list(get_patterns(normalized_code))
+ hint = suggest_buggy(rules, program_patterns)
+ if not hint:
+ hint = suggest_true(rules, program_patterns)
+ user_submissions.append(Submission(normalized_code, correct, program_patterns, hint))
+
+ # skip submissions after the first correct program
if correct:
break
# ignore traces with no / only correct submissions
- if not any(p[1] for p in programs) or all(p[1] for p in programs):
+ if (not any(s.correct for s in user_submissions) or
+ all(s.correct for s in user_submissions)):
continue
- suggested = []
- for i, (program, correct) in enumerate(programs):
- program_patterns = list(get_patterns(program))
- #for p in program_patterns:
- # print(p[0])
- #print()
-
- # check if previously suggested rules match
- for s in suggested:
- s['passed'] += 1
- match = check_rule(s['rule'], program_patterns)
- if (s['rule'].klass == 'T' and len(match) == len(s['rule'].condition) or
- s['rule'].klass == 'F' and not match):
- s['matched'].append(s['passed'])
-
- # only check programs until first correct submission
- if correct:
- print(str(i) + ' PASS\t' + program)
- print()
- break
-
- # check rules in order, buggy rules first
- found = False
- for rule in (
- [r for r in rules if r.klass == 'F'] +
- [r for r in rules if r.klass == 'T']):
- match = check_rule(rule, program_patterns)
- if (rule.klass == 'F' and not match or
- rule.klass == 'T' and len(match) != len(rule.condition)-1):
- continue
- found = True
-
- # store suggestion to see if it was implemented later
- if not any(s['program'] == program and s['rule'] == rule for s in suggested):
- # passed: how many submission before PASS
- # matched: list of submissions where the suggested rule matched
- # (the current submission has index 0, the next 1 and so on)
- suggested.append({'program': program, 'rule': rule, 'found': i, 'passed': 0, 'matched': []})
-
- # get highlights
- highlight = set()
- for m in match:
- for n in m[1]:
- highlight.add((n[0].pos, len(n[0].val), ('green' if rule.klass == 'T' else 'red')))
-
- # print highighted program
- print(str(i) + ' FAIL', end='\t')
- color_print(program, list(highlight))
-
- # print rule
- for rule_pattern, yes in rule.condition:
- if rule.klass == 'T':
- if rule_pattern in [pattern for pattern, nodes in program_patterns]:
- print('good\t' + str(rule_pattern))
- else:
- print('missing\t' + str(rule_pattern))
- else:
- if rule_pattern in [pattern for pattern, nodes in program_patterns]:
- print('buggy\t' + str(rule_pattern))
- print()
- break
-
- if not found:
- print(str(i) + ' FAIL\t' + str(program))
- print()
- unsuggestable[program] += 1
-
- print('Suggestions and versions in which they were implemented:')
- for s in suggested:
- index = len(programs) - (s['passed'] + 1)
- print(index, [index + m for m in s['matched']])
- all_suggestions += suggested
-
+ submissions[user] = user_submissions
+
+ # print submissions with hints for debugging
+ for s in user_submissions:
+ print('PASS' if s.correct else 'FAIL', end='\t')
+ marks = []
+ if s.hint and s.hint.remove:
+ marks = mark(s.patterns, s.hint.remove, 'red')
+ color_print(s.program, marks)
+
+ if s.hint:
+ for x in s.hint.remove:
+ print('buggy\t', x)
+ for x in s.hint.add:
+ print('missing\t', x)
+ for x in s.hint.add_alternatives:
+ print('alternative\t', x)
+ print()
print('-'*30)
print()
-# report
-not_matched = [s for s in all_suggestions if s['passed'] not in s['matched']]
-matched = [s for s in all_suggestions if s['passed'] in s['matched']]
-
-# rules that did / did not match in the solution
-good = collections.Counter()
-bad = collections.Counter()
-for s in all_suggestions:
- (good if s in matched else bad)[s['rule']] += 1
+# submissions where hint pattern was implemented in the solution
+good_hint = []
+# submissions where one of the alternative hint patterns was implemented in the solution
+medium_hint = []
+# submissions where none of the hint patterns were implemented in the solution
+bad_hint = []
+# submissions with no suggestions
+no_hint = []
+
+# total number of submissions
+n_subs = 0
+for user, subs in submissions.items():
+ solution = subs[-1]
+ solution_patterns = [p[0] for p in solution.patterns]
+ for s in subs[:-1]:
+ n_subs += 1
+ if s.hint:
+ if s.hint.remove:
+ # buggy rule: at least one pattern should not be present in solution
+ if any(pattern not in solution_patterns for pattern in s.hint.remove):
+ good_hint.append(s)
+ else:
+ bad_hint.append(s)
+ else:
+ # true rule: all patterns should be present in solution
+ if all(pattern in solution_patterns for pattern in s.hint.add):
+ # best suggested pattern(s) match
+ good_hint.append(s)
+ elif any(pattern in solution_patterns for pattern in s.hint.add_alternatives):
+ # some suggested pattern(s) match
+ medium_hint.append(s)
+ else:
+ bad_hint.append(s)
+ else:
+ no_hint.append(s)
print('Statistics')
print('----------')
-print('# of suggestions that were implemented:', len(matched))
-print('# of suggestions that were not implemented:', len(not_matched))
-print('avg. # of submissions before suggestion was implemented:',
- sum(s['matched'][0] for s in matched)/len(matched))
-print('avg. # of submissions until PASS after suggestion was implemented:',
- sum(s['passed'] - s['matched'][0] for s in matched)/len(matched))
-print('avg. # of submissions until PASS if suggestion was not implemented:',
- sum(s['passed'] for s in not_matched)/len(not_matched))
-#print('avg. % of submissions after suggestion where it was not implemented :', 1-mean(len(s['matched'])/s['passed'] for s in matched))
-print()
-
-print('Unsuggestable programs')
-print('----------------------')
-for p, count in unsuggestable.most_common():
- print('{}\t{}'.format(count, p))
-print()
-
-print('Good rules')
-print('----------')
-for r, count in good.most_common():
- print('Suggested for ' + str(count) + ' submissions')
- print(r)
-
-print('Bad rules')
-print('---------')
-for r, count in bad.most_common():
- print('Suggested for ' + str(count) + ' submissions')
- print(r)
+print('avg. submissions per trace:', mean(len(subs) for subs in submissions.values()))
+print('avg. clauses in solution:', mean(subs[-1].program.count('.') for subs in submissions.values()))
+print('total submissions:', n_subs)
+print('positive hints (best implemented):', len([s for s in good_hint if s.hint.add]))
+print('positive hints (alternative implemented):', len([s for s in medium_hint if s.hint.add_alternatives]))
+print('positive hints (not implemented):', len([s for s in bad_hint if s.hint.add]))
+print('buggy hints (implemented):', len([s for s in good_hint if s.hint.remove]))
+print('buggy hints (not implemented):', len([s for s in bad_hint if s.hint.remove]))
+print('no hints:', len(no_hint))