diff options
-rwxr-xr-x | test-rules.py | 320 |
1 files changed, 173 insertions, 147 deletions
diff --git a/test-rules.py b/test-rules.py index bb89e7e..8762cdd 100755 --- a/test-rules.py +++ b/test-rules.py @@ -13,16 +13,42 @@ from monkey.action import parse as parse_trace from monkey.patterns import get_patterns from prolog.util import parse as prolog_parse, rename_vars_list, stringify, tokenize +# klass: T/F +# condition: list of patterns +# distribution: rule class distribution +# quality: rule quality +class Rule(collections.namedtuple('Rule', ['klass', 'condition', 'distribution', 'quality'])): + def __str__(self): + s = 'Rule: class = {}, distribution = {}, quality = {}\n'.format(self.klass, self.distribution, self.quality) + s += ''.join([str(pattern) + '\n' for pattern in self.condition]) + return s + +# program: submitted code +# correct: does this submission pass all tests? +# patterns: patterns in this submission +# hint: suggested hint +class Submission(collections.namedtuple('Submission', ['program', 'correct', 'patterns', 'hint'])): + pass + +# ok: required patterns already in program (unused) +# remove: patterns that should be removed +# add: patterns that should be added (intersection from all relevant rules) +# add_alternatives: patterns that should be added (union from all relevant rules) +# NOTE currently either (only remove is set) or (both add and add_alternatives are set) +class Hint(collections.namedtuple('Hint', ['ok', 'remove', 'add', 'add_alternatives'])): + pass + # script arguments solutions_file = sys.argv[1] -pid = int(sys.argv[2]) -data_dir = sys.argv[3] +data_dir = sys.argv[2] -attributes_file = os.path.join(data_dir, 'attributes') -rules_file = os.path.join(data_dir, 'rules') -users_file = os.path.join(data_dir, 'users-test') +pid_file = os.path.join(data_dir, 'pid') +attributes_file = os.path.join(data_dir, 'attributes.tab') +rules_file = os.path.join(data_dir, 'rules.txt') +users_file = os.path.join(data_dir, 'users-test.txt') programs_file = os.path.join(data_dir, 'programs.pickle') +pid = int(open(pid_file, 'r').read().strip()) # read test results for known programs test = pickle.load(open(programs_file, 'rb')) @@ -35,20 +61,15 @@ for solution in pickle.load(open(solutions_file, 'rb')): # read attributes attributes = dict([line.strip().split('\t') for line in open(attributes_file, 'r').readlines()]) - -class Rule(collections.namedtuple('Rule', ['klass', 'condition', 'distribution', 'quality'])): - def __str__(self): - s = 'Rule: class = {}, distribution = {}, quality = {}\n'.format(self.klass, self.distribution, self.quality) - s += ''.join([str(pattern) + '\n' for pattern, yes in self.condition]) - return s +attributes_ordered = [line.strip().split('\t')[1] for line in open(attributes_file, 'r').readlines()] # read rules rules = [] for line in open(rules_file, 'r').readlines(): - match = re.match(r'IF ((?:a[0-9]*[^ ]*(?: AND )*)*) THEN correct=([TF]) *\[ *([0-9]*) *([0-9]*)\] *([0-9.]*)', line.strip()) + match = re.match(r'IF ((?:a[0-9]*!=F(?: AND )*)*) THEN correct=([TF]) *\[ *([0-9]*) *([0-9]*)\] *([0-9.]*)', line.strip()) if match: m = tuple(match.groups()) - condition = tuple((attributes[field[:-3]], field.endswith('!=F')) for field in m[0].split(' AND ')) + condition = tuple(attributes[field[:-3]] for field in m[0].split(' AND ')) rules.append(Rule(m[-4], condition, (int(m[-3]), int(m[-2])), float(m[-1]))) #print(rules[-1]) else: @@ -65,161 +86,166 @@ def color_print(text, ranges): i = start + length print(text[i:]) -# check if given patterns match the rule -def check_rule(rule, patterns): - ret_patterns = [] - for rule_pattern, yes in rule.condition: - if yes: - # this pattern must be present - for pattern, nodes in patterns: - if pattern == rule_pattern: - ret_patterns.append((rule_pattern, nodes)) - else: - # this pattern must not be present - if rule_pattern in [p[0] for p in patterns]: - return [] - return ret_patterns +# generate marks for selected patterns for color_print +def mark(patterns, selected, color): + marks = set() + for pattern, nodes in patterns: + if pattern in selected: + marks |= set((n[0].pos, len(n[0].val), color) for n in nodes if n[0].pos) + return marks + +# return a hint for the best applicable buggy rule +def suggest_buggy(rules, patterns): + for rule in [r for r in rules if r.klass == 'F']: + # suggest this rule if all patterns in condition are found in the program + if all(rule_pattern in [p[0] for p in patterns] for rule_pattern in rule.condition): + return Hint(ok=[], remove=rule.condition, add=[], add_alternatives=[]) + return None + +# return a hint for the best applicable true rule +def suggest_true(rules, patterns): + # get match info for all true rules + rule_matches = collections.defaultdict(list) + for rule in [r for r in rules if r.klass == 'T']: + found = set() + missing = set() + for rule_pattern in rule.condition: + if any(pattern == rule_pattern for pattern, nodes in patterns): + found.add(rule_pattern) + else: + missing.add(rule_pattern) + if missing: + rule_matches[len(found)].append((found, missing)) + + # return rules with most matching patterns + for i in range(10, 0, -1): + if i not in rule_matches: + continue + missing_patterns = collections.Counter() + for found, missing in rule_matches[i]: + for pattern in missing: + missing_patterns[pattern] += 1 + + best_missing_patterns = [] + for missing_pattern, count in missing_patterns.most_common(): + if count == missing_patterns.most_common()[0][1]: + best_missing_patterns.append(missing_pattern) + else: + break -# keep track of when each suggestion was applied -all_suggestions = [] -# programs with no matching rule -unsuggestable = collections.Counter() + add = [] + for pattern in attributes_ordered: + if pattern in best_missing_patterns: + add = [pattern] + break + add_alternatives = [pattern for pattern, _ in missing_patterns.most_common() if pattern not in add] + return Hint(ok=[], remove=[], add=add, add_alternatives=add_alternatives) + + return None +# evaluate hints on student traces +submissions = collections.defaultdict(list) for user, trace in traces.items(): - # get submissions from trace - programs = [] + # get submissions for this user + user_submissions = [] code = '' for action in parse_trace(trace): code = action.apply(code) if action.type == 'test': + # skip syntactically incorrect submissions if prolog_parse(code) is None: continue + normalized_code = stringify(rename_vars_list(tokenize(code))) - if programs and normalized_code == programs[-1][0]: + # skip repeated submissions + if user_submissions and normalized_code == user_submissions[-1].program: + continue + # skip submissions without cached test results + if normalized_code not in test: continue correct = test[normalized_code]['n_tests'] == test[normalized_code]['n_passed'] - programs.append((normalized_code, correct)) - # ignore actions after first correct submission + + # check rules for this submission + program_patterns = list(get_patterns(normalized_code)) + hint = suggest_buggy(rules, program_patterns) + if not hint: + hint = suggest_true(rules, program_patterns) + user_submissions.append(Submission(normalized_code, correct, program_patterns, hint)) + + # skip submissions after the first correct program if correct: break # ignore traces with no / only correct submissions - if not any(p[1] for p in programs) or all(p[1] for p in programs): + if (not any(s.correct for s in user_submissions) or + all(s.correct for s in user_submissions)): continue - suggested = [] - for i, (program, correct) in enumerate(programs): - program_patterns = list(get_patterns(program)) - #for p in program_patterns: - # print(p[0]) - #print() - - # check if previously suggested rules match - for s in suggested: - s['passed'] += 1 - match = check_rule(s['rule'], program_patterns) - if (s['rule'].klass == 'T' and len(match) == len(s['rule'].condition) or - s['rule'].klass == 'F' and not match): - s['matched'].append(s['passed']) - - # only check programs until first correct submission - if correct: - print(str(i) + ' PASS\t' + program) - print() - break - - # check rules in order, buggy rules first - found = False - for rule in ( - [r for r in rules if r.klass == 'F'] + - [r for r in rules if r.klass == 'T']): - match = check_rule(rule, program_patterns) - if (rule.klass == 'F' and not match or - rule.klass == 'T' and len(match) != len(rule.condition)-1): - continue - found = True - - # store suggestion to see if it was implemented later - if not any(s['program'] == program and s['rule'] == rule for s in suggested): - # passed: how many submission before PASS - # matched: list of submissions where the suggested rule matched - # (the current submission has index 0, the next 1 and so on) - suggested.append({'program': program, 'rule': rule, 'found': i, 'passed': 0, 'matched': []}) - - # get highlights - highlight = set() - for m in match: - for n in m[1]: - highlight.add((n[0].pos, len(n[0].val), ('green' if rule.klass == 'T' else 'red'))) - - # print highighted program - print(str(i) + ' FAIL', end='\t') - color_print(program, list(highlight)) - - # print rule - for rule_pattern, yes in rule.condition: - if rule.klass == 'T': - if rule_pattern in [pattern for pattern, nodes in program_patterns]: - print('good\t' + str(rule_pattern)) - else: - print('missing\t' + str(rule_pattern)) - else: - if rule_pattern in [pattern for pattern, nodes in program_patterns]: - print('buggy\t' + str(rule_pattern)) - print() - break - - if not found: - print(str(i) + ' FAIL\t' + str(program)) - print() - unsuggestable[program] += 1 - - print('Suggestions and versions in which they were implemented:') - for s in suggested: - index = len(programs) - (s['passed'] + 1) - print(index, [index + m for m in s['matched']]) - all_suggestions += suggested - + submissions[user] = user_submissions + + # print submissions with hints for debugging + for s in user_submissions: + print('PASS' if s.correct else 'FAIL', end='\t') + marks = [] + if s.hint and s.hint.remove: + marks = mark(s.patterns, s.hint.remove, 'red') + color_print(s.program, marks) + + if s.hint: + for x in s.hint.remove: + print('buggy\t', x) + for x in s.hint.add: + print('missing\t', x) + for x in s.hint.add_alternatives: + print('alternative\t', x) + print() print('-'*30) print() -# report -not_matched = [s for s in all_suggestions if s['passed'] not in s['matched']] -matched = [s for s in all_suggestions if s['passed'] in s['matched']] - -# rules that did / did not match in the solution -good = collections.Counter() -bad = collections.Counter() -for s in all_suggestions: - (good if s in matched else bad)[s['rule']] += 1 +# submissions where hint pattern was implemented in the solution +good_hint = [] +# submissions where one of the alternative hint patterns was implemented in the solution +medium_hint = [] +# submissions where none of the hint patterns were implemented in the solution +bad_hint = [] +# submissions with no suggestions +no_hint = [] + +# total number of submissions +n_subs = 0 +for user, subs in submissions.items(): + solution = subs[-1] + solution_patterns = [p[0] for p in solution.patterns] + for s in subs[:-1]: + n_subs += 1 + if s.hint: + if s.hint.remove: + # buggy rule: at least one pattern should not be present in solution + if any(pattern not in solution_patterns for pattern in s.hint.remove): + good_hint.append(s) + else: + bad_hint.append(s) + else: + # true rule: all patterns should be present in solution + if all(pattern in solution_patterns for pattern in s.hint.add): + # best suggested pattern(s) match + good_hint.append(s) + elif any(pattern in solution_patterns for pattern in s.hint.add_alternatives): + # some suggested pattern(s) match + medium_hint.append(s) + else: + bad_hint.append(s) + else: + no_hint.append(s) print('Statistics') print('----------') -print('# of suggestions that were implemented:', len(matched)) -print('# of suggestions that were not implemented:', len(not_matched)) -print('avg. # of submissions before suggestion was implemented:', - sum(s['matched'][0] for s in matched)/len(matched)) -print('avg. # of submissions until PASS after suggestion was implemented:', - sum(s['passed'] - s['matched'][0] for s in matched)/len(matched)) -print('avg. # of submissions until PASS if suggestion was not implemented:', - sum(s['passed'] for s in not_matched)/len(not_matched)) -#print('avg. % of submissions after suggestion where it was not implemented :', 1-mean(len(s['matched'])/s['passed'] for s in matched)) -print() - -print('Unsuggestable programs') -print('----------------------') -for p, count in unsuggestable.most_common(): - print('{}\t{}'.format(count, p)) -print() - -print('Good rules') -print('----------') -for r, count in good.most_common(): - print('Suggested for ' + str(count) + ' submissions') - print(r) - -print('Bad rules') -print('---------') -for r, count in bad.most_common(): - print('Suggested for ' + str(count) + ' submissions') - print(r) +print('avg. submissions per trace:', mean(len(subs) for subs in submissions.values())) +print('avg. clauses in solution:', mean(subs[-1].program.count('.') for subs in submissions.values())) +print('total submissions:', n_subs) +print('positive hints (best implemented):', len([s for s in good_hint if s.hint.add])) +print('positive hints (alternative implemented):', len([s for s in medium_hint if s.hint.add_alternatives])) +print('positive hints (not implemented):', len([s for s in bad_hint if s.hint.add])) +print('buggy hints (implemented):', len([s for s in good_hint if s.hint.remove])) +print('buggy hints (not implemented):', len([s for s in bad_hint if s.hint.remove])) +print('no hints:', len(no_hint)) |