main.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93

import argparse
import ast
import collections
import os

import pandas
import sklearn.dummy
import sklearn.ensemble
import sklearn.model_selection
import sklearn.tree

import canonicalize
import dynamic
import regex

def get_programs(path: str, names: str, do_canonicalize: bool = False):
    """Load programs from path.

    The directory layout read here is ``<path>/<user id>/submissions/<file>``,
    where only entries of ``path`` whose name is a decimal number are treated
    as user directories, and each submission file is named
    ``<seq>-<total>-<passed>``.

    Args:
        path: Data directory holding one sub-directory per user.
        names: Forwarded unchanged to ``canonicalize.canonicalize`` as
            ``given_names``; only used when ``do_canonicalize`` is true.
            (The script entry point in this file passes the value parsed
            from ``--names``, i.e. a list of strings or ``None``.)
        do_canonicalize: When true, each program is replaced by its
            canonicalized form before de-duplication.

    Returns:
        A dict keyed by the (possibly canonicalized) program text. Each value
        is a dict with ``'users'`` (set of integer user ids that submitted the
        program), ``'correct'`` (whether ``total == passed`` in the file name
        of the first submission seen for that program) and ``'original'``
        (the stripped text of that first submission before canonicalization).

    Raises:
        OSError: If a user has no readable ``submissions`` directory or a
            submission cannot be opened.
        ValueError: If a file holding a valid program is not named with
            exactly three ``-``-separated parts.
    """
    programs = {}

    # Keep the directory name as found on disk (so that zero-padded names such
    # as "012" still resolve) and order users by their numeric id.
    # isdecimal() only accepts characters that int() can convert.
    user_names = sorted(
        (uid for uid in os.listdir(path) if uid.isdecimal()), key=int)
    for user_name in user_names:
        user = int(user_name)
        user_dir = os.path.join(path, user_name, 'submissions')
        user_subs = set()

        # os.listdir() returns entries in arbitrary order; sort so that the
        # submission supplying 'correct'/'original' for a duplicated program
        # is the same on every run and file system.
        for submission in sorted(os.listdir(user_dir)):
            with open(os.path.join(user_dir, submission), 'r') as f:
                try:
                    code = f.read().strip()
                except (OSError, UnicodeDecodeError):
                    # unreadable or undecodable file: skip it
                    continue

            # skip syntactically incorrect programs; besides SyntaxError,
            # ast.parse can raise ValueError (null bytes) and
            # RecursionError/MemoryError (excessively nested source)
            try:
                ast.parse(code)
            except (SyntaxError, ValueError, RecursionError, MemoryError):
                continue

            # canonicalize
            original = code
            if do_canonicalize:
                code = canonicalize.canonicalize(code, given_names=names)

            # count each distinct program at most once per user
            if code in user_subs:
                continue
            user_subs.add(code)

            _, total, passed = submission.split('-')
            if code not in programs:
                programs[code] = {
                    'users': set(),
                    'correct': total == passed,
                    'original': original,
                }
            programs[code]['users'].add(user)

    return programs

if __name__ == '__main__':
    # Command-line entry point: load the submissions, derive one boolean
    # feature per attribute, and report the mean 10-fold cross-validation
    # score of a few classifiers predicting the 'correct' column.
    parser = argparse.ArgumentParser(description='Get patterns from student programs.')
    parser.add_argument('path', help='path to data directory')
    parser.add_argument('--names', nargs='*', required=False, help='names that should not be anonymized')
    parser.add_argument('--exec', required=False, help='code to append for dynamic patterns')
    parser.add_argument('--inputs', nargs='*', required=False, help='inputs for dynamic patterns')

    args = parser.parse_args()
    path = args.path.rstrip('/')

    programs = get_programs(path, args.names, do_canonicalize=True)

    # Attributes from both extractors, in insertion order (regex first).
    attrs = collections.OrderedDict()
    attrs.update(regex.get_attributes(programs))
    attrs.update(dynamic.get_attributes(programs, args.exec, args.inputs))

    #print('Attributes:')
    #for attr in attrs:
    #    print(attr, attrs[attr]['desc'].to_string(inline=True))

    # Add one boolean entry per attribute to every program record: True when
    # the program is listed under that attribute.
    for program in programs:
        for attr in attrs:
            programs[program][attr] = program in attrs[attr]['programs']
    data = pandas.DataFrame.from_dict(programs, orient='index')

    # Evaluate on a fixed 70% sample only; the remaining rows are not used
    # by this script.
    train = data.sample(frac=0.7, random_state=0)
    Y = train['correct']
    X = train.drop(['users', 'correct', 'original'], axis='columns')

    # Seed every learner so repeated runs print the same scores, matching the
    # fixed random_state used for sampling above. The baseline's strategy is
    # spelled out because DummyClassifier's default differs between
    # scikit-learn versions.
    learners = collections.OrderedDict([
        ('major', sklearn.dummy.DummyClassifier(strategy='most_frequent')),
        ('tree', sklearn.tree.DecisionTreeClassifier(random_state=0)),
        ('rf', sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=0)),
    ])

    for name, learner in learners.items():
        scores = sklearn.model_selection.cross_val_score(learner, X, Y, cv=10)
        print('{}:\t{}'.format(name, scores.mean()))