summaryrefslogtreecommitdiff
path: root/main.py
diff options
context:
space:
mode:
Diffstat (limited to 'main.py')
-rw-r--r--main.py84
1 files changed, 84 insertions, 0 deletions
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..610901f
--- /dev/null
+++ b/main.py
@@ -0,0 +1,84 @@
+import argparse
+import ast
+import collections
+import os
+
+import pandas
+import sklearn.dummy
+import sklearn.ensemble
+import sklearn.model_selection
+import sklearn.tree
+
+import canonicalize
+import dynamic
+import regex
+
def get_programs(path: str, names: str, do_canonicalize: bool = False):
    """Load programs from path.

    The directory at `path` is scanned for entries whose names consist only
    of digits; each is treated as a user id and its `submissions`
    subdirectory is read. Submission file names must have the form
    `<seq>-<total>-<passed>`; a program counts as correct when the `total`
    and `passed` fields are equal.

    Args:
        path: data directory containing one numerically named directory
            per user.
        names: forwarded as `given_names` to `canonicalize.canonicalize`.
            NOTE(review): annotated as `str`, but the script entry point
            passes the value of `--names` (a list, or None) -- confirm.
        do_canonicalize: if true, each program is canonicalized before
            deduplication.

    Returns:
        A dict mapping program text to
        `{'users': set of int user ids, 'correct': bool}`.

    Raises:
        ValueError: if a submission file name does not split into exactly
            three '-'-separated fields.
    """
    programs = {}

    # Keep the original directory name next to the numeric id so that
    # zero-padded names (e.g. '03') still resolve to an existing path.
    user_dirs = sorted(
        (int(name), name) for name in os.listdir(path) if name.isdigit())
    for user, dir_name in user_dirs:
        user_dir = os.path.join(path, dir_name, 'submissions')
        user_subs = set()

        # os.listdir returns entries in arbitrary order; sort so that the
        # submission which sets the 'correct' flag is the same on every run.
        for submission in sorted(os.listdir(user_dir)):
            with open(os.path.join(user_dir, submission), 'r') as f:
                code = f.read().strip()

            # Skip programs that do not parse. Only parse-related errors are
            # caught so that KeyboardInterrupt / SystemExit still propagate.
            try:
                ast.parse(code)
            except (SyntaxError, ValueError, RecursionError):
                continue

            if do_canonicalize:
                code = canonicalize.canonicalize(code, given_names=names)

            # Count each distinct program only once per user.
            if code in user_subs:
                continue
            user_subs.add(code)

            _seq, total, passed = submission.split('-')
            if code not in programs:
                # Correctness is recorded from the first submission seen
                # with this program text.
                programs[code] = {'users': set(), 'correct': total == passed}
            programs[code]['users'].add(user)

    return programs
+
if __name__ == '__main__':
    # Script entry point: load the student programs, mark which attributes
    # each program has, and report the mean 10-fold cross-validation score
    # of several classifiers that predict program correctness.
    parser = argparse.ArgumentParser(description='Get patterns from student programs.')
    parser.add_argument('path', help='path to data directory')
    parser.add_argument('--names', nargs='*', required=False, help='names that should not be anonymized')
    parser.add_argument('--exec', required=False, help='code to append for dynamic patterns')
    parser.add_argument('--inputs', nargs='*', required=False, help='inputs for dynamic patterns')

    args = parser.parse_args()
    # Drop trailing slashes so that basename() yields the last path component.
    path = args.path.rstrip('/')
    problem_name = os.path.basename(path)  # not used further in this script

    programs = get_programs(path, args.names, do_canonicalize=True)

    # Collect attributes from both extractors; each attribute entry is
    # indexed below by its 'programs' key.
    attrs = collections.OrderedDict()
    attrs.update(regex.get_attributes(programs))
    attrs.update(dynamic.get_attributes(programs, args.exec, args.inputs))
    print('Attributes:', attrs.keys())

    # Add one boolean field per attribute to every program record.
    for program in programs:
        for attr in attrs:
            programs[program][attr] = program in attrs[attr]['programs']

    # One row per program; 'correct' is the target, the attribute columns
    # are the features.
    data = pandas.DataFrame.from_dict(programs, orient='index')
    y = data['correct']
    X = data.drop(['users', 'correct'], axis='columns')

    learners = collections.OrderedDict([
        # Majority-class baseline. The strategy is given explicitly because
        # the DummyClassifier default differs between scikit-learn versions.
        ('major', sklearn.dummy.DummyClassifier(strategy='most_frequent')),
        ('tree', sklearn.tree.DecisionTreeClassifier()),
        ('rf', sklearn.ensemble.RandomForestClassifier()),
    ])

    for name, learner in learners.items():
        scores = sklearn.model_selection.cross_val_score(learner, X, y, cv=10)
        print('{}:\t{}'.format(name, scores.mean()))