"""Extract patterns from student programs and evaluate classifiers on them."""

import argparse
import ast
import collections
import os
import typing

import pandas
import sklearn.dummy
import sklearn.ensemble
import sklearn.model_selection
import sklearn.tree

import canonicalize
import dynamic
import regex


def get_programs(
    path: str,
    names: typing.Optional[typing.Sequence[str]],
    do_canonicalize: bool = False,
) -> typing.Dict[str, typing.Dict[str, typing.Any]]:
    """Load programs from path.

    The directory layout read here is ``<path>/<user id>/submissions/<file>``,
    where only all-digit entries of ``path`` are treated as user directories
    and each submission file name has the form ``<seq>-<total>-<passed>``.

    Args:
        path: Data directory containing one sub-directory per user.
        names: Forwarded as ``given_names`` to ``canonicalize.canonicalize``;
            the script passes the ``--names`` option here, which is a list of
            strings or ``None`` when the option is omitted.
        do_canonicalize: When true, programs are keyed by their canonicalized
            source instead of the stripped source text.

    Returns:
        A dict mapping program source to a dict with the keys ``'users'``
        (set of integer user ids that submitted it), ``'correct'`` (whether
        ``total == passed`` in the name of the first file seen for that
        program) and ``'original'`` (stripped, un-canonicalized source of
        that first file).

    Raises:
        ValueError: If a submission file name does not split into exactly
            three ``-``-separated parts.
    """
    programs = {}
    # Keep the directory name as found on disk and order numerically; going
    # through int() and back to str() would look up '007' as '7'.
    user_names = sorted(
        (name for name in os.listdir(path) if name.isdigit()), key=int)
    for user_name in user_names:
        user = int(user_name)
        user_dir = os.path.join(path, user_name, 'submissions')
        user_subs = set()
        # os.listdir returns entries in arbitrary order; sort so that the
        # submission which defines 'original'/'correct' for a program, and the
        # insertion order of `programs`, do not depend on the filesystem.
        for submission in sorted(os.listdir(user_dir)):
            with open(os.path.join(user_dir, submission), 'r') as f:
                code = f.read().strip()

            # skip syntactically incorrect programs
            try:
                ast.parse(code)
            except (SyntaxError, ValueError, RecursionError, MemoryError):
                # ValueError covers source containing null bytes; the last two
                # cover pathologically nested input. A bare except would also
                # swallow KeyboardInterrupt and SystemExit.
                continue

            # canonicalize
            original = code
            if do_canonicalize:
                code = canonicalize.canonicalize(code, given_names=names)

            # remember each distinct program once per user
            if code in user_subs:
                continue
            user_subs.add(code)

            _seq, total, passed = submission.split('-')
            if code not in programs:
                programs[code] = {
                    'users': set(),
                    # string comparison of the two file-name fields
                    'correct': total == passed,
                    'original': original,
                }
            programs[code]['users'].add(user)

    return programs


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Get patterns from student programs.')
    parser.add_argument('path', help='path to data directory')
    parser.add_argument('--names', nargs='*', required=False, help='names that should not be anonymized')
    parser.add_argument('--exec', required=False, help='code to append for dynamic patterns')
    parser.add_argument('--inputs', nargs='*', required=False, help='inputs for dynamic patterns')
    args = parser.parse_args()

    path = args.path.rstrip('/')
    # NOTE(review): problem_name is not used in the visible part of the
    # script; kept in case later code relies on it.
    problem_name = os.path.basename(path)
    programs = get_programs(path, args.names, do_canonicalize=True)

    # Collect attributes from both project modules; each entry is expected to
    # provide a 'desc' object and a 'programs' container (see uses below).
    attrs = collections.OrderedDict()
    attrs.update(regex.get_attributes(programs))
    attrs.update(dynamic.get_attributes(programs, args.exec, args.inputs))

    print('Attributes:')
    for attr in attrs:
        print(attr, attrs[attr]['desc'].to_string(inline=True))

    # Add one boolean column per attribute: does the program appear in the
    # attribute's 'programs' container?
    for program in programs:
        for attr in attrs:
            programs[program][attr] = program in attrs[attr]['programs']

    data = pandas.DataFrame.from_dict(programs, orient='index')
    train = data.sample(frac=0.7, random_state=0)

    Y = train['correct']
    X = train.drop(['users', 'correct', 'original'], axis='columns')
    # NOTE(review): the hold-out split below is not used by the
    # cross-validation that follows, which runs on all of X/Y; kept in case
    # later code relies on it.
    X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.33, random_state=0)

    learners = collections.OrderedDict([
        ('major', sklearn.dummy.DummyClassifier()),
        ('tree', sklearn.tree.DecisionTreeClassifier()),
        ('rf', sklearn.ensemble.RandomForestClassifier(n_estimators=100)),
    ])

    # Report the mean 10-fold cross-validation score of each learner.
    for name, learner in learners.items():
        scores = sklearn.model_selection.cross_val_score(learner, X, Y, cv=10)
        print('{}:\t{}'.format(name, scores.mean()))