# CodeQ: an online programming tutor.
# Copyright (C) 2016,2017 UL FRI
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
import collections
from itertools import chain, combinations, product
import pickle
import random
import sys
from nltk import ParentedTree, Tree
from prolog.util import parse as prolog_parse
# construct pattern to match the structure of nodes given by [include],
# supports variables and literals
def pattern(node, include):
if not isinstance(node, Tree):
return None
label = node.label()
if any(n is node for n in include):
if label == 'literal':
return '"{}"'.format(node[0].val)
if label == 'variable':
return '{}'.format(label)
return None
if label == 'functor':
return '({} "{}")'.format(label, node[0].val)
subpats = [pattern(child, include) for child in node]
pat = None
if any(subpats):
if label == 'and':
if subpats[1]:
pat = subpats[1]
if subpats[0]:
if pat:
pat = subpats[0] + ' ' + pat
else:
pat = subpats[0]
elif label == 'args':
pat = label
for i, subpat in enumerate(subpats):
if subpat:
pat += ' {}'.format(subpat)
pat = '(' + pat + ')'
elif label == 'unop':
pat = '(' + label + ' ' + node[0].val + ' ' + subpats[1] + ')'
elif label == 'binop':
pat = label
if subpats[0]:
pat += ' {}'.format(subpats[0])
pat += ' "{}"'.format(node[1].val)
if subpats[2]:
pat += ' {}'.format(subpats[2])
pat = '(' + pat + ')'
elif label == 'clause':
pat = label
for i, subpat in enumerate(subpats):
if subpat:
pat += ' {}'.format(subpats[i])
return '(' + pat + ')'
elif label == 'compound':
if len(subpats) > 1 and subpats[1]:
pat = label
for i, subpat in enumerate(subpats):
pat += ' {}'.format(subpat)
pat = '(' + pat + ')'
else:
return None
elif label == 'head':
pat = label
pat += ' {}'.format(subpats[0])
pat = '(' + pat + ')'
elif label == 'list':
pat = 'list '
if subpats[0]:
pat += '(h {})'.format(subpats[0])
if subpats[0] and subpats[1]:
pat += ' '
if subpats[1]:
pat += '(t {})'.format(subpats[1])
pat = '(' + pat + ')'
if not pat:
for s in subpats:
if s:
pat = s
break
return pat
def get_patterns(tree):
if isinstance(tree, str):
tree = prolog_parse(tree)
if tree is None:
return
tree = ParentedTree.convert(tree)
# get patterns separately for each clause
for clause in tree:
# collect variable nodes in this clause
variables = collections.defaultdict(list)
for node in clause.subtrees():
if isinstance(node, Tree) and node.label() == 'variable':
name = node[0].val
variables[name].append(node)
# yield patterns for singleton variables
for var, nodes in variables.items():
if len(nodes) == 1:
pat = pattern(clause, nodes)
if pat:
yield pat, nodes
# yield patterns for variable-variable pairs (within a clause)
for var, nodes in variables.items():
for selected in combinations(nodes, 2):
pat = pattern(clause, selected)
if pat:
yield pat, selected
# yield patterns for variable-literal / literal-literal pairs
# yield patterns for singleton literals
# (only within a topmost compound / binop / unop)
def patterns_with_literals(node):
if not isinstance(node, Tree):
return
if node.label() in {'compound', 'binop', 'unop'}:
vars = [n for n in node.subtrees() if n.label() == 'variable']
lits = [n for n in node.subtrees() if n.label() == 'literal']
for selected in chain(combinations(lits, 1),
combinations(lits, 2),
product(lits, vars)):
pat = pattern(clause, selected)
if pat:
yield pat, selected
else:
for child in node:
yield from patterns_with_literals(child)
yield from patterns_with_literals(clause)
# Extract edits and other data from existing traces for each problem.
# Run with: python3 -m monkey.patterns
if __name__ == '__main__':
pid = int(sys.argv[1])
name = sys.argv[2]
submissions = pickle.load(open('pickle/programs-{}.pickle'.format(pid), 'rb'))
# find test/train users
users = sorted({user for code, info in submissions.items() for user in info['users']})
random.Random(0).shuffle(users)
split = int(len(users)*0.7)
learn_users = set(users[:split])
test_users = set(users[split:])
# save test users to file
with open('data/users-test-{}.txt'.format(pid), 'wt') as f:
for user in test_users:
print(user, file=f)
# find test/train programs
data = {
'train': [],
'test': []
}
for code, info in submissions.items():
if len(code) > 1000 or prolog_parse(code) is None:
continue
if name not in code:
continue
data['train'] += [(code, info['n_tests'] == info['n_passed'])] * len(info['users'] & learn_users)
data['test'] += [(code, info['n_tests'] == info['n_passed'])] * len(info['users'] & test_users)
# print info about test users and test/train programs
print('Test users:')
print(test_users)
print()
for which in ['train', 'test']:
print('Programs ({}):'.format(which))
print('correct: {} ({} unique)'.format(
len([code for code, correct in data[which] if correct]),
len({code for code, correct in data[which] if correct})))
print('incorrect: {} ({} unique)'.format(
len([code for code, correct in data[which] if not correct]),
len({code for code, correct in data[which] if not correct})))
print()
# extract attributes from training data
patterns = collections.Counter()
for code, correct in data['train']:
for pat, nodes in get_patterns(code):
patterns[pat] += 1
attrs = []
with open('data/attributes-{:03}.tab'.format(pid), 'w') as pattern_file:
for i, (pat, count) in enumerate(patterns.most_common()):
if count < 5:
break
attrs.append(pat)
print('a{}\t{}'.format(i, pat), file=pattern_file)
# check and write attributes for training/test data
for t in ['train', 'test']:
with open('data/programs-{:03}-{}.tab'.format(pid, t), 'w') as f:
# print header
print('\t'.join(['code', 'correct'] + ['a'+str(i) for i in range(len(attrs))]), file=f)
print('\t'.join(['d'] * (len(attrs)+2)), file=f)
print('meta\tclass', file=f)
# print rows (program, correct, attr1, attr2, …)
for code, correct in data[t]:
record = '{}\t{}'.format(repr(code), 'T' if correct else 'F')
code_pats = [pat for pat, nodes in get_patterns(code)]
for pat in attrs:
record += '\t{}'.format('T' if pat in code_pats else 'F')
print(record, file=f)