From 27d4458613a5b61f16ad9bf59ca1de460fea3b3a Mon Sep 17 00:00:00 2001 From: Timotej Lazar Date: Mon, 9 Jan 2017 18:07:23 +0100 Subject: First commit is the best commit --- monkey/__init__.py | 0 monkey/action.py | 239 +++++++++++++++++++++++++ monkey/patterns.py | 511 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 750 insertions(+) create mode 100644 monkey/__init__.py create mode 100644 monkey/action.py create mode 100644 monkey/patterns.py (limited to 'monkey') diff --git a/monkey/__init__.py b/monkey/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/monkey/action.py b/monkey/action.py new file mode 100644 index 0000000..c2f73ad --- /dev/null +++ b/monkey/action.py @@ -0,0 +1,239 @@ +#!/usr/bin/python3 + +# CodeQ: an online programming tutor. +# Copyright (C) 2015 UL FRI +# +# This program is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import sys + +class Action: + def __init__(self, abstime, data): + self.type = data['typ'] + self.time = abstime # time from start + + # generic actions + if self.type == 'open': + self.timestamp = data['time'] + elif self.type == 'ins': + self.type = 'insert' + self.offset = data['off'] + self.text = data['txt'] + self.length = len(self.text) + elif self.type == 'rm': + self.type = 'remove' + self.offset = data['off'] + self.text = data['txt'] + self.length = len(self.text) + elif self.type == 'test': + # data['feedback'] is a 'test_results' hint object, or a list + # containing such hint + hint = None + if isinstance(data['feedback'], list): + for hint in data['feedback']: + if hint['id'] == 'test_results': + break + else: + hint = data['feedback'] + if hint is not None: + self.n_correct = hint['args']['passed'] + self.n_all = hint['args']['total'] + else: + self.n_correct = self.n_all = None + elif self.type == 'hint': + self.feedback = data['feedback'] + elif self.type == 'hnt': + # obsolete Prolog hint action, with no additional info + self.type = 'hint' + self.feedback = None + elif self.type == 'plan': + self.index = data.get('index') + + # Prolog actions + elif self.type == 'prolog_solve': + self.query = data['query'] + elif self.type == 'slva': + # obsolete Prolog "solve all" action + self.type = 'prolog_solve' + self.query = data['qry'] + elif self.type == 'experiment': + self.data = data['data'] + + # Python actions + elif self.type == 'python_input': + self.text = data['txt'] + elif self.type == 'python_run': + self.program = data['program'] + + # robot actions + elif self.type == 'robot_run': + self.program = data['program'] + + def __str__(self): + s = 't = ' + str(self.time/1000.0) + ' ' + self.type + if self.type in {'insert', 'remove'}: + s += ': "' + self.text.replace('\n', '\\n').replace('\t', '\\t') + '" at ' + str(self.offset) + elif self.type == 'test': + s += ': {0} / {1}'.format(self.n_correct, self.n_all) + elif self.type == 'hint': + if self.feedback is not None: + s += ': ' + ', '.join(sorted([hint['id'] for hint in self.feedback])) + else: + s += ': ?' + elif self.type == 'plan': + if self.index is not None: + s += ': ' + str(self.index) + elif self.type == 'prolog_solve': + s += ': "' + self.query + '"' + elif self.type == 'experiment': + s += ': ' + self.data; + return s + + # apply this action to text + def apply(self, text): + if self.type == 'insert': + return text[:self.offset] + self.text + text[self.offset:] + elif self.type == 'remove': + return text[:self.offset] + text[self.offset+self.length:] + else: + return text + + # reverse the application of this action + def unapply(self, text): + if self.type == 'insert': + return text[:self.offset] + text[self.offset+self.length:] + elif self.type == 'remove': + return text[:self.offset] + self.text + text[self.offset:] + else: + return text + +# parse log from database into a list of actions, cleaning up some fluff. +# ignore non-text actions (queries and tests) +def parse(data): + if data == None: + return [], [] + + actions = [] + incorrect = set() + + time = 0 + code = '' + for packet in data: + try: + time += packet['dt'] + action = Action(time, packet) + except: + # ignore any errors while decoding a packet + sys.stderr.write('Error decoding packet: {}\n'.format(packet)) + continue + + # skip normalization if this is the first action + if actions == []: + actions.append(action) + code = action.apply(code) + continue + + # add to list of actions; modify previously added action if necessary + prev = actions[-1] + + # remove superfluous REMOVE action when newline is inserted (due to editor auto-indent) + if prev.type == 'remove' and action.type == 'insert' and \ + action.time == prev.time and \ + action.offset == prev.offset and action.length > prev.length and \ + action.text[action.length-prev.length:] == prev.text: + # discard last REMOVE action + code = prev.unapply(code) + actions.pop() + + # replace current action with something better + length = action.length - prev.length + new = Action(prev.time, {'typ': 'ins', 'off': prev.offset, 'txt': action.text[:length]}) + actions.append(new) + code = new.apply(code) + + # remove superfluous INSERT action when newline is removed (due to editor auto-indent) + elif prev.type == 'remove' and action.type == 'insert' and \ + action.time == prev.time and \ + action.offset == prev.offset and action.length < prev.length and \ + prev.text[prev.length-action.length:] == action.text: + # discard last INSERT action + code = prev.unapply(code) + actions.pop() + + # replace current action with something better + length = prev.length - action.length + new = Action(prev.time, {'typ': 'rem', 'off': prev.offset, 'txt': prev.text[:length]}) + actions.append(new) + code = new.apply(code) + + # otherwise, simply append the current action + else: + actions.append(action) + code = action.apply(code) + + return actions + +# expand any multi-char actions (does not do anything for the 99%) +def expand(actions): + i = 0 + while i < len(actions): + if actions[i].type == 'insert' and len(actions[i].text) > 1: + a = actions.pop(i) + for offset in range(len(a.text)): + actions.insert(i+offset, + Action(a.time, {'typ': 'ins', 'off': a.offset+offset, 'txt': a.text[offset]})) + i += len(a.text) + elif actions[i].type == 'remove' and len(actions[i].text) > 1: + a = actions.pop(i) + for offset in range(len(a.text)): + actions.insert(i, + Action(a.time, {'typ': 'rm', 'off': a.offset+offset, 'txt': a.text[offset]})) + i += len(a.text) + else: + i += 1 + +# some sample code +if __name__ == '__main__': + from db.models import Problem, Solution + + # print all problem ids + print('problems:') + for problem in Problem.list(): + print(' {}\t{}'.format(problem.id, problem.identifier)) + print() + + pid = input('enter problem id: ') + + # print all attempt ids for the selected problem + print('users solving problem ' + str(pid) + ':') + attempts = Solution.filter(problem_id=pid) + print(', '.join([str(attempt.codeq_user_id) for attempt in attempts])) + print() + + uid = input('enter user id: ') + attempt = Solution.get(problem_id=pid, codeq_user_id=uid) + + try: + actions = parse(attempt.trace) + print('read ' + str(len(actions)) + ' actions from log') + + print('code versions for this attempt:') + code = '' + for action in actions: + code = action.apply(code) + print(action) + print(code.strip()) + print() + except Exception as ex: + sys.stderr.write('Error parsing action log: ' + str(ex)) diff --git a/monkey/patterns.py b/monkey/patterns.py new file mode 100644 index 0000000..ae958a0 --- /dev/null +++ b/monkey/patterns.py @@ -0,0 +1,511 @@ +# CodeQ: an online programming tutor. +# Copyright (C) 2015 UL FRI +# +# This program is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import collections +from itertools import combinations +import pickle +import random +import re +import sys + +from nltk import ParentedTree, Tree +from nltk.tgrep import tgrep_positions + +from prolog.util import Token, parse as prolog_parse + +# construct pattern to match the structure of nodes given by [include] +def pattern(node, include): + if isinstance(node, Token): + if node.type == 'NAME': + return '"{}"'.format(node.val) + return None + + if any(n is node for n in include): + if node.label() == 'variable': + return 'variable <: "{}"'.format(node[0].val) + return None + + label = node.label() + subpats = [pattern(child, include) for child in node] + + if label == 'functor': + return '{} <1 ({})'.format(label, subpats[0]) + + pat = None + if any(subpats): + if label == 'and': + if subpats[1]: + pat = subpats[1] + if subpats[0]: + if pat: + pat = subpats[0] + ' .. (' + pat + ')' + else: + pat = subpats[0] + elif label == 'args': + pat = label + for i, subpat in enumerate(subpats): + if subpat: + pat += ' <{} ({})'.format(i+1, subpat) + elif label == 'binop': + pat = label + if subpats[0]: + pat += ' <1 ({})'.format(subpats[0]) + pat += ' <2 ("{}")'.format(node[1].val) + if subpats[2]: + pat += ' <3 ({})'.format(subpats[2]) + elif label == 'clause': + pat = label + for i, subpat in enumerate(subpats): + if subpat: + pat += ' {} ({})'.format('<1' if i == 0 else '<<', subpats[i]) + elif label == 'compound': + if len(subpats) > 1 and subpats[1]: + pat = label + for i, subpat in enumerate(subpats): + pat += ' <{} ({})'.format(i+1, subpat) + else: + return None + elif label == 'head': + pat = label + pat += ' <1 ({})'.format(subpats[0]) + if not pat: + for s in subpats: + if s: + pat = s + break + return pat + +# construct a pattern connecting (variable) node [a] to [b] +def connect(a, b): + path_a = [] + node = a + while node.parent(): + node = node.parent() + path_a.insert(0, node) + + path_b = [] + node = b + while node.parent(): + node = node.parent() + path_b.insert(0, node) + + common_ancestor = None + for i in range(min(len(path_a), len(path_b))): + if path_a[i] is not path_b[i]: + break + if path_a[i].label() in {'compound', 'head'}: + break + common_ancestor = path_a[i] + + def node_label(node): + if node.label() == 'compound': + right = node[1] + nargs = 1 + while right._label == "args" and len(right) == 2: + right = right[1] + nargs += 1 + return 'compound <1 (functor <: "{}{}")'.format(node[0][0].val, nargs) + if node.label() == 'binop': + return 'binop <2 "{}"'.format(node[1].val) + return node.label() + i = 0 + while path_a[i].label() != 'clause': + i += 1 + + # path from top to common ancestor + pat = path_a[i].label() + i += 1 + n_top = 0 + while i < min(len(path_a), len(path_b)) and path_a[i] is path_b[i]: + node = path_a[i] + i += 1 + if node.label() == 'and': + continue + if node.parent().label() == 'and': + op = '<+(and)' + else: + op = '<{}'.format(node.parent_index()+1) + pat += ' {} ({}'.format(op, node_label(node)) + n_top += 1 + + path_a = path_a[i:] + path_b = path_b[i:] + + # path from common ancestor to [a] + n_a = 0 + for node in path_a: + if node.label() == 'and': + continue + op = '<' + if node.parent().label() == 'and': + op = '<+(and)' + elif node.parent_index() is not None: + op = '<{}'.format(node.parent_index()+1) + pat += ' {} ({}'.format(op, node_label(node)) + n_a += 1 + pat += ' <{} ({} <: "{}")'.format(a.parent_index()+1, a.label(), a[0].val) + pat += ')' * n_a + + # path from common ancestor to [b] + n_b = 0 + for node in path_b: + if node.label() == 'and': + continue + op = '<' + if node.parent().label() == 'and': + op = '<+(and)' + elif node.parent_index() is not None: + op = '<{}'.format(node.parent_index()+1) + pat += ' {} ({}'.format(op, node_label(node)) + n_b += 1 + pat += ' <{} ({} <: "{}")'.format(b.parent_index()+1, b.label(), b[0].val) + + pat += ')' * (n_top + n_b) + return pat + +# replace variable names with patterns and backlinks +def postprocess(pattern): + macros = '@ VAR /[A-Z]/; ' + #m = re.search(r'"[A-Z][^"]*_[0-9]+[^"]*"', pattern) + m = re.search(r'"[A-Z]_[0-9]+"', pattern) + nvars = 0 + while m is not None: + orig_name = m.group(0) + n = orig_name.strip('"').split("_")[1] + pat_name = 'v{}'.format(nvars) + nvars += 1 + pattern = pattern[:m.start()] + '@VAR={}{}'.format(pat_name, n) + pattern[m.end():] + for m in re.finditer(orig_name, pattern): + pattern = pattern[:m.start()] + '~{}{}'.format(pat_name, n) + pattern[m.end():] + m = re.search(r'"([A-Z]*_[0-9]+)"', pattern) + return macros + pattern + +def postprocess_simple(pattern): + pattern = postprocess(pattern) + if pattern.startswith("@ VAR /[A-Z]/; clause <2 (or"): + return None + #relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|\(variable.*?\))', pattern) + relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', pattern) + + # elements + elements = [] + current = "" + i = 0 + par = 0 + while i < len(pattern): + if par > 0 and pattern[i] == ")": + par -= 1 + if par == 0: + elements.append(current) + current = "" + if par > 0 and pattern[i] == "(": + par += 1 + if par == 0 and \ + (pattern[i:].startswith("(head") or + pattern[i:].startswith("(compound") or + pattern[i:].startswith("(binop")): + par = 1 + if par > 0: + current += pattern[i] + i += 1 + # simplify variable + for ei, e in enumerate(elements): + # #elements[ei] = re.sub("\(variable.*?\)", "(variable)", e) + elements[ei] = "("+" ".join(re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', e))+")" + elements = sorted(elements) + #print(pattern) + #print(relevant) + #return "_".join(relevant)#pattern + return " ".join(elements)#pattern + +# construct pattern to match the structure of nodes given by [include], +# supports variables and literals +def pattern2(node, include): + if isinstance(node, Token): + return None + + label = node.label() + if any(n is node for n in include): + if label == 'literal': + return '"{}"'.format(node[0].val) + if label == 'variable': + return '{}'.format(label) + return None + if label == 'functor': + return '({} "{}")'.format(label, node[0].val) + + subpats = [pattern2(child, include) for child in node] + pat = None + if any(subpats): + if label == 'and': + if subpats[1]: + pat = subpats[1] + if subpats[0]: + if pat: + pat = subpats[0] + ' ' + pat + else: + pat = subpats[0] + elif label == 'args': + pat = label + for i, subpat in enumerate(subpats): + if subpat: + pat += ' {}'.format(subpat) + pat = '(' + pat + ')' + elif label == 'unop': + pat = '(' + label + ' ' + node[0].val + ' ' + subpats[1] + ')' + elif label == 'binop': + pat = label + if subpats[0]: + pat += ' {}'.format(subpats[0]) + pat += ' "{}"'.format(node[1].val) + if subpats[2]: + pat += ' {}'.format(subpats[2]) + pat = '(' + pat + ')' + elif label == 'clause': + pat = label + for i, subpat in enumerate(subpats): + if subpat: + pat += ' {}'.format(subpats[i]) + return '(' + pat + ')' + elif label == 'compound': + if len(subpats) > 1 and subpats[1]: + pat = label + for i, subpat in enumerate(subpats): + pat += ' {}'.format(subpat) + pat = '(' + pat + ')' + else: + return None + elif label == 'head': + pat = label + pat += ' {}'.format(subpats[0]) + pat = '(' + pat + ')' + elif label == 'list': + pat = 'list ' + if subpats[0]: + pat += '(h {})'.format(subpats[0]) + if subpats[0] and subpats[1]: + pat += ' ' + if subpats[1]: + pat += '(t {})'.format(subpats[1]) + pat = '(' + pat + ')' + if not pat: + for s in subpats: + if s: + pat = s + break + return pat + +def get_patterns(tree): + orig = tree + if isinstance(tree, str): + tree = prolog_parse(tree) + if tree is None: + return + tree = ParentedTree.convert(tree) + + # get patterns separately for each clause + for clause in tree: + all_variables = [] + variables = collections.defaultdict(list) + def walk(node): + if isinstance(node, Tree): + if node.label() == 'variable': + name = node[0].val + variables[name].append(node) + all_variables.append(node) + else: + for child in node: + walk(child) + walk(clause) + + # connecting pairs of nodes with same variable + for var, nodes in variables.items(): + for selected in combinations(nodes, 2): + pat = pattern2(clause, selected) + if pat: + print(pat) + yield pat, selected + #pat = connect(*selected) + #if pat: + # pp = postprocess_simple(pat) + # if pp: + # yield pp, selected + + # add singletons + for var, nodes in variables.items(): + if len(nodes) == 1: + pat = pattern2(clause, nodes) + if pat: + print(pat) + yield pat, nodes + #pat = pattern(tree, var) + #if pat: + # pp = postprocess_simple(pat) + # if pp: + # yield pp, nodes + + # add patterns for literal/variable pairs + literals = [node for node in clause.subtrees() if node.label() == 'literal'] + for literal in literals: + top = literal + while top != clause and top.parent().label() in {'compound', 'binop', 'unop', 'args', 'list'}: + top = top.parent() + variables = [node for node in top.subtrees() if node.label() == 'variable'] + for var in variables: + pat = pattern2(clause, [var, literal]) + if pat: + yield pat, [var, literal] + +# # connecting pairs of nodes with variables +# for selected in combinations(all_variables, 2): +# pat = connect(selected[0], selected[1]) +# if pat: +# yield postprocess(pat), selected + +# # using pairs of nodes with variables +# for selected in combinations(all_variables, 2): +# pat = pattern(tree, selected) +# if pat: +# yield postprocess(pat), selected + +# # using pairs of nodes with same variable +# for var, nodes in variables.items(): +# for selected in combinations(nodes, 2): +# pat = pattern(tree, selected) +# if pat: +# yield postprocess(pat), selected + +# # using each variable separately +# for var, nodes in variables.items(): +# pat = pattern(tree, nodes) +# if pat: +# yield postprocess(pat), nodes + +# # using each goal to select variables FIXME +# goals = [s for s in tree.subtrees() if s.label() in {'compound', 'binop'}] +# for goal in goals: +# goal_vars = {n.val for n in goal.leaves() if n.type == 'VARIABLE'} +# pat = pattern(tree, goal_vars) +# if pat: +# yield postprocess(pat) + +# nltk.tgrep does not play nice with non-Tree leaves +def _tgrep_prepare(tree): + if not isinstance(tree, ParentedTree): + tree = ParentedTree.convert(tree) + def prepare(node): + if isinstance(node, Token) or isinstance(node, str): + return ParentedTree(str(node), []) + return ParentedTree(node.label(), [prepare(child) for child in node]) + return prepare(tree) + +def find_motif(tree, motif): + tree = _tgrep_prepare(tree) + return tgrep_positions(motif, [tree]) + +# Extract edits and other data from existing traces for each problem. +# Run with: python3 -m monkey.patterns +if __name__ == '__main__': + # Ignore traces from these users. + ignored_users = [ + 1, # admin + 231, # test + 360, # test2 + 358, # sasha + ] + + pid = int(sys.argv[1]) + name = sys.argv[2] + submissions = pickle.load(open('pickle/programs-{}.pickle'.format(pid), 'rb')) + + print('Analyzing programs for problem {}…'.format(pid)) + ndata = { + 'train': [], + 'test': [] + } + # get all users + users = set() + for code, data in submissions.items(): + users |= data['users'] + users = list(users) + users.sort() + random.Random(0).shuffle(users) + split = int(len(users)*0.7) + learn_users = set(users[:split]) + test_users = set(users[split:]) + print(test_users) + + for code, data in submissions.items(): + if len(code) > 1000 or prolog_parse(code) is None: + continue + if name not in code: + continue + ndata['train'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & learn_users) + ndata['test'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & test_users) + #programs += [(code, data['n_tests'] == data['n_passed'])] * len(data['users']) + + print('correct: {} ({} unique)'.format( + len([code for code, correct in ndata['train'] if correct]), + len({code for code, correct in ndata['train'] if correct}))) + print('incorrect: {} ({} unique)'.format( + len([code for code, correct in ndata['train'] if not correct]), + len({code for code, correct in ndata['train'] if not correct}))) + + #iprograms.sort() + #random.Random(0).shuffle(programs) + #split = int(len(programs)*0.7) + #data = { + # 'train': programs[:split], + # 'test': programs[split:], + #} + data = ndata + + # extract attributes from training data + patterns = collections.Counter() + for code, correct in data['train']: + for pat, nodes in get_patterns(code): + patterns[pat] += 1 + + attrs = [] + with open('data/attributes-{:03}.tab'.format(pid), 'w') as pattern_file: + for i, (pat, count) in enumerate(patterns.most_common()): + if count < 1: + break + attrs.append(pat) + print('a{}\t{}'.format(i, pat), file=pattern_file) + + # check and write attributes for training/test data + for t in ['train', 'test']: + with open('data/programs-{:03}-{}.tab'.format(pid, t), 'w') as f: + # print header + print('\t'.join(['code', 'correct'] + ['a'+str(i) for i in range(len(attrs))]), file=f) + print('\t'.join(['d'] * (len(attrs)+2)), file=f) + print('meta\tclass', file=f) + for code, correct in data[t]: + record = '{}\t{}'.format(repr(code), 'T' if correct else 'F') + + ## check attributes by using tgrep to find patterns + #tree = _tgrep_prepare(prolog_parse(code)) + #for pat in attrs: + # matches = list(tgrep_positions(pat, [tree])) + # record += '\t{}'.format(bool(matches[0])) + + # check attributes by enumerating patterns in code + code_pats = [pat for pat, nodes in get_patterns(code)] + for pat in attrs: + record += '\t{}'.format('T' if pat in code_pats else 'F') + + print(record, file=f) -- cgit v1.2.1