import argparse
import ast
import collections
import os
import typing

import pandas
import sklearn.dummy
import sklearn.ensemble
import sklearn.model_selection
import sklearn.tree

import canonicalize
import dynamic
import regex


def get_programs(
    path: str,
    names: typing.Optional[typing.Sequence[str]],
    do_canonicalize: bool = False,
) -> typing.Dict[str, typing.Dict[str, typing.Any]]:
    """Load programs from path.

    Every entry of ``path`` whose name consists only of digits is treated as
    a user directory; submissions are read from its ``submissions``
    subdirectory. Submission file names must have the form
    ``<seq>-<total>-<passed>``.

    Args:
        path: data directory containing one subdirectory per user.
        names: passed to ``canonicalize.canonicalize`` as ``given_names``;
            only used when ``do_canonicalize`` is true.
        do_canonicalize: if true, replace each program's text with the result
            of ``canonicalize.canonicalize`` before deduplicating.

    Returns:
        A dict mapping program text to ``{'users': set of int user ids,
        'correct': bool}``. ``correct`` is taken from the first submission
        seen with that text and is true when the ``total`` and ``passed``
        fields of its file name are equal.

    Raises:
        ValueError: if a submission file name does not split into exactly
            three ``-``-separated fields.
        OSError: if a user has no readable ``submissions`` directory.
    """
    programs = {}
    # Keep the directory name next to the numeric id: rebuilding the name
    # with str(int(name)) would lose leading zeros ('007' -> '7').
    user_dirs = sorted(
        (int(name), name) for name in os.listdir(path) if name.isdigit())
    for user, dirname in user_dirs:
        user_dir = os.path.join(path, dirname, 'submissions')
        user_subs = set()
        # os.listdir returns entries in arbitrary order; sort (by file name)
        # so that the submission that first defines a program's 'correct'
        # flag is the same on every run.
        for submission in sorted(os.listdir(user_dir)):
            with open(os.path.join(user_dir, submission), 'r') as f:
                code = f.read().strip()

            # skip syntactically incorrect programs (but let
            # KeyboardInterrupt / SystemExit propagate)
            try:
                ast.parse(code)
            except (SyntaxError, ValueError, RecursionError, MemoryError):
                continue

            # canonicalize
            if do_canonicalize:
                code = canonicalize.canonicalize(code, given_names=names)

            # count each distinct program only once per user
            if code in user_subs:
                continue
            user_subs.add(code)

            _seq, total, passed = submission.split('-')
            if code not in programs:
                programs[code] = {'users': set(), 'correct': total == passed}
            programs[code]['users'].add(user)

    return programs


def main():
    """Build an attribute table from student programs and score learners.

    Loads canonicalized programs from the data directory given on the
    command line, collects attributes from ``regex.get_attributes`` and
    ``dynamic.get_attributes``, turns them into one boolean column per
    attribute, and prints the mean 10-fold cross-validation score of each
    learner predicting the ``correct`` column.
    """
    parser = argparse.ArgumentParser(description='Get patterns from student programs.')
    parser.add_argument('path', help='path to data directory')
    parser.add_argument('--names', nargs='*', required=False, help='names that should not be anonymized')
    parser.add_argument('--exec', required=False, help='code to append for dynamic patterns')
    parser.add_argument('--inputs', nargs='*', required=False, help='inputs for dynamic patterns')
    args = parser.parse_args()

    path = args.path.rstrip('/')
    problem_name = os.path.basename(path)

    programs = get_programs(path, args.names, do_canonicalize=True)

    # Each attribute value is indexed with 'programs' below and tested for
    # membership of the program text.
    attrs = collections.OrderedDict()
    attrs.update(regex.get_attributes(programs))
    attrs.update(dynamic.get_attributes(programs, args.exec, args.inputs))

    print('Attributes:', attrs.keys())

    # One boolean feature per attribute: does the program have it?
    for program in programs:
        for attr in attrs:
            programs[program][attr] = program in attrs[attr]['programs']

    # Rows are programs; columns are 'users', 'correct' and the attributes.
    data = pandas.DataFrame.from_dict(programs, orient='index')

    y = data['correct']
    X = data.drop(['users', 'correct'], axis='columns')

    learners = collections.OrderedDict([
        ('major', sklearn.dummy.DummyClassifier()),
        ('tree', sklearn.tree.DecisionTreeClassifier()),
        ('rf', sklearn.ensemble.RandomForestClassifier()),
    ])

    for name, learner in learners.items():
        scores = sklearn.model_selection.cross_val_score(learner, X, y, cv=10)
        print('{}:\t{}'.format(name, scores.mean()))


if __name__ == '__main__':
    main()