diff options
Diffstat (limited to 'main.py')
-rw-r--r-- | main.py | 84 |
1 files changed, 84 insertions, 0 deletions
import argparse
import ast
import collections
import os
import typing

import pandas
import sklearn.dummy
import sklearn.ensemble
import sklearn.model_selection
import sklearn.tree

import canonicalize
import dynamic
import regex


def get_programs(
    path: str,
    names: typing.Optional[typing.Sequence[str]],
    do_canonicalize: bool = False,
) -> typing.Dict[str, typing.Dict[str, typing.Any]]:
    """Load programs from path.

    Every entry of ``path`` whose name consists only of digits is treated as
    a user directory; its submissions are read from
    ``<path>/<user>/submissions/``. Submission file names must split on
    ``'-'`` into exactly three fields (``seq-total-passed``).

    Args:
        path: Data directory containing one sub-directory per user.
        names: Passed to ``canonicalize.canonicalize`` as ``given_names``
            (may be ``None``); only used when ``do_canonicalize`` is true.
        do_canonicalize: If true, replace each program's text with its
            canonicalized form before de-duplicating.

    Returns:
        A dict mapping program text to ``{'users': set_of_user_ids,
        'correct': bool}``. ``correct`` is taken from the first submission
        seen for that program text and is true when the ``total`` and
        ``passed`` file-name fields are equal strings.

    Raises:
        ValueError: If a submission file name does not split into exactly
            three ``'-'``-separated fields.
    """
    programs = {}

    users = sorted(int(uid) for uid in os.listdir(path) if uid.isdigit())
    for user in users:
        user_dir = os.path.join(path, str(user), 'submissions')
        user_subs = set()

        # Sorted so that the submission deciding a program's 'correct' flag
        # does not depend on the file system's arbitrary listing order.
        for submission in sorted(os.listdir(user_dir)):
            with open(os.path.join(user_dir, submission), 'r') as f:
                code = f.read().strip()

            # Skip programs that cannot be parsed. Only parse failures are
            # caught, so KeyboardInterrupt / SystemExit still propagate.
            try:
                ast.parse(code)
            except (SyntaxError, ValueError, RecursionError):
                continue

            # canonicalize
            if do_canonicalize:
                code = canonicalize.canonicalize(code, given_names=names)

            # Count each distinct program at most once per user.
            if code in user_subs:
                continue
            user_subs.add(code)

            _seq, total, passed = submission.split('-')
            if code not in programs:
                programs[code] = {'users': set(), 'correct': total == passed}
            programs[code]['users'].add(user)

    return programs


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Get patterns from student programs.')
    parser.add_argument('path', help='path to data directory')
    parser.add_argument('--names', nargs='*', required=False, help='names that should not be anonymized')
    parser.add_argument('--exec', required=False, help='code to append for dynamic patterns')
    parser.add_argument('--inputs', nargs='*', required=False, help='inputs for dynamic patterns')

    args = parser.parse_args()
    path = args.path.rstrip('/')
    problem_name = os.path.basename(path)

    programs = get_programs(path, args.names, do_canonicalize=True)

    # Collect attributes from both extractors, keeping insertion order.
    attrs = collections.OrderedDict()
    attrs.update(regex.get_attributes(programs))
    attrs.update(dynamic.get_attributes(programs, args.exec, args.inputs))
    print('Attributes:', attrs.keys())

    # One boolean column per attribute: whether the program is listed under
    # that attribute's 'programs' entry.
    for program in programs:
        for attr in attrs:
            programs[program][attr] = program in attrs[attr]['programs']
    data = pandas.DataFrame.from_dict(programs, orient='index')
    y = data['correct']
    X = data.drop(['users', 'correct'], axis='columns')

    learners = collections.OrderedDict([
        ('major', sklearn.dummy.DummyClassifier()),
        ('tree', sklearn.tree.DecisionTreeClassifier()),
        ('rf', sklearn.ensemble.RandomForestClassifier()),
    ])

    # Report the mean 10-fold cross-validation score of each learner.
    for name, learner in learners.items():
        scores = sklearn.model_selection.cross_val_score(learner, X, y, cv=10)
        print('{}:\t{}'.format(name, scores.mean()))