From ca587416e17b82f3816f187ead5b186f5c1bc126 Mon Sep 17 00:00:00 2001
From: Timotej Lazar <timotej.lazar@fri.uni-lj.si>
Date: Tue, 18 Apr 2017 17:26:45 +0200
Subject: Add script to extract patterns from submissions

---
 get-attributes.py | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 get-attributes.py
diff --git a/get-attributes.py b/get-attributes.py
new file mode 100644
index 0000000..78258a3
--- /dev/null
+++ b/get-attributes.py
@@ -0,0 +1,97 @@
+import argparse
+import collections
+import os
+import random
+
+from monkey.patterns import get_patterns
+from prolog.util import parse as prolog_parse
+
+# command line: one positional argument, the path to the data directory
+cli = argparse.ArgumentParser(description='Get patterns from student programs.')
+cli.add_argument('path', help='path to data directory')
+path = cli.parse_args().path.rstrip('/')  # trailing slashes would make basename() empty
+# the directory's base name is later used to filter programs (must occur in the code)
+name = os.path.basename(path)
+submissions = os.path.join(path, 'submissions')
+
+# select test/train users: shuffle with a fixed seed, then split 70/30
+users = sorted(int(uid) for uid in os.listdir(submissions))
+random.Random(0).shuffle(users)
+split = int(len(users)*0.7)  # index of the first test user
+learn_users = set(users[:split])
+test_users = set(users[split:])
+
+# save test users to file, one ID per line, sorted so the file is stable
+with open(os.path.join(path, 'users-test.txt'), 'wt') as f:
+    for user in sorted(test_users):
+        print(user, file=f)
+
+# find test/train programs; each entry is a (code, all tests passed) pair
+data = {
+    'train': [],
+    'test': [],
+}
+for user in users:
+    user_dir = os.path.join(submissions, str(user))
+    user_subs = set()  # programs already seen for this user
+
+    # each submission is in a file named <seq. no>-<total tests>-<passed tests>;
+    # os.listdir() returns names in arbitrary order, so sort them to make the
+    # order of the collected programs reproducible
+    for submission in sorted(os.listdir(user_dir)):
+        with open(os.path.join(user_dir, submission), 'r') as f:
+            code = f.read()
+        if code in user_subs:  # do not add a program twice for the same user
+            continue
+        user_subs.add(code)
+        if prolog_parse(code) is None:  # skip syntactically incorrect programs
+            continue
+        if name not in code:  # only add programs with defined predicate
+            continue
+
+        _seq, total, passed = submission.split('-')
+        data['test' if user in test_users else 'train'].append((code, total == passed))
+
+# print info about test users and test/train programs
+print('Test users:')
+print(test_users)
+print()
+for which in ('train', 'test'):
+    print('Programs ({}):'.format(which))
+    # split this set's programs by outcome; each list may hold the same
+    # program text several times (submitted by different users)
+    passing = [code for code, correct in data[which] if correct]
+    failing = [code for code, correct in data[which] if not correct]
+    print('correct: {} ({} unique)'.format(len(passing), len(set(passing))))
+    print('incorrect: {} ({} unique)'.format(len(failing), len(set(failing))))
+    print()
+
+# extract attributes from training data: tally every pattern occurrence
+patterns = collections.Counter(
+    pat
+    for code, correct in data['train']
+    for pat, nodes in get_patterns(code))
+
+# keep patterns seen at least 5 times; most_common() yields them by
+# decreasing count, and attribute a<i> is the i-th most frequent one
+attrs = [pat for pat, count in patterns.most_common() if count >= 5]
+with open(path + '/attributes.tab', 'w') as pattern_file:
+    for index, pat in enumerate(attrs):
+        # one line per attribute: its name, a tab, then the pattern itself
+        print('a{}\t{}'.format(index, pat), file=pattern_file)
+
+# check and write attributes for training/test data, one table per split
+for t in ['train', 'test']:
+    with open(path + '/programs-{}.tab'.format(t), 'w') as f:
+        # print header: column names, a 'd' per column, then meta/class markers
+        print('\t'.join(['code', 'correct'] + ['a'+str(i) for i in range(len(attrs))]), file=f)
+        print('\t'.join(['d'] * (len(attrs)+2)), file=f)
+        print('meta\tclass', file=f)
+
+        # print rows (program, correct, attr1, attr2, …)
+        for code, correct in data[t]:
+            # a set makes each membership test O(1); patterns are hashable
+            # because they already serve as Counter keys
+            code_pats = {pat for pat, nodes in get_patterns(code)}
+            flags = ['T' if pat in code_pats else 'F' for pat in attrs]
+            print('\t'.join([repr(code), 'T' if correct else 'F'] + flags), file=f)
-- 
cgit v1.2.1