summaryrefslogtreecommitdiff
path: root/create_data.py
blob: 2296727f14e6f52a39be354e93c36e4e7287dda4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import argparse
import ast
import collections
import os

import pandas
import sklearn.dummy
import sklearn.ensemble
import sklearn.model_selection
import sklearn.tree

import canonicalize
import dynamic
import regex

import main
import Orange


if __name__ == '__main__':
    # Script entry point: load programs from a data directory, compute
    # pattern attributes for them, save the result as an Orange table and
    # write a plain-text listing of the attributes to "attributes.txt".
    parser = argparse.ArgumentParser(description='Get patterns from student programs.')
    parser.add_argument('filename', help='Orange file name')
    parser.add_argument('path', help='path to data directory')
    parser.add_argument('--dynamic', action="store_true", help='include dynamic atts')
    parser.add_argument('--names', nargs='*', required=False, help='names that should not be anonymized')
    parser.add_argument('--exec', required=False, help='code to append for dynamic patterns')
    parser.add_argument('--inputs', nargs='*', required=False, help='inputs for dynamic patterns')

    args = parser.parse_args()
    # Drop trailing slashes from the directory argument before passing it on.
    path = args.path.rstrip('/')

    programs = main.get_programs(path, args.names, do_canonicalize=True)

    # Attributes are kept in insertion order: regex-based ones first, then
    # (only with --dynamic) the dynamic ones.
    attrs = collections.OrderedDict()
    attrs.update(regex.get_attributes(programs))
    if args.dynamic:
        attrs.update(dynamic.get_attributes(programs, args.exec, args.inputs))

    # One two-valued ('F'/'T') discrete variable per attribute.
    orange_attrs = [
        Orange.data.DiscreteVariable(at, values=('F', 'T'))
        for at in attrs
    ]

    # Class variable 'correct' plus the program text as a string meta column.
    cl = Orange.data.DiscreteVariable('correct', values=('F', 'T'))
    mcode = Orange.data.StringVariable('code')
    orange_domain = Orange.data.Domain(orange_attrs, cl, metas=[mcode])
    orange_data = Orange.data.Table.from_domain(orange_domain)
    for program in programs:
        # Skip falsy keys (presumably empty program text -- confirm against
        # main.get_programs).
        if not program:
            continue
        instance = Orange.data.Instance(orange_domain)
        for at, info in attrs.items():
            # The attribute is set from whether this program is listed
            # among the attribute's 'programs'.
            instance[at] = program in info['programs']
        instance[cl] = programs[program]['correct']
        instance[mcode] = program
        # The same instance is appended once per entry in the program's
        # 'users' collection.
        for _ in range(len(programs[program]['users'])):
            orange_data.append(instance)

    orange_data.save(args.filename)

    # Write "<attribute>: <description>" lines, flattening newlines in the
    # description to spaces. The context manager guarantees the file is
    # flushed and closed (the handle used to be left open).
    with open("attributes.txt", "wt") as fatt:
        for at, info in attrs.items():
            fatt.write("{}: {}\n".format(at, str(info["desc"]).replace('\n', ' ')))