# CodeQ: an online programming tutor. # Copyright (C) 2015 UL FRI # # This program is free software: you can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the Free # Software Foundation, either version 3 of the License, or (at your option) any # later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more # details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import collections from itertools import combinations import pickle import random import re import sys from nltk import ParentedTree, Tree from nltk.tgrep import tgrep_positions from prolog.util import Token, parse as prolog_parse # construct pattern to match the structure of nodes given by [include] def pattern(node, include): if isinstance(node, Token): if node.type == 'NAME': return '"{}"'.format(node.val) return None if any(n is node for n in include): if node.label() == 'variable': return 'variable <: "{}"'.format(node[0].val) return None label = node.label() subpats = [pattern(child, include) for child in node] if label == 'functor': return '{} <1 ({})'.format(label, subpats[0]) pat = None if any(subpats): if label == 'and': if subpats[1]: pat = subpats[1] if subpats[0]: if pat: pat = subpats[0] + ' .. (' + pat + ')' else: pat = subpats[0] elif label == 'args': pat = label for i, subpat in enumerate(subpats): if subpat: pat += ' <{} ({})'.format(i+1, subpat) elif label == 'binop': pat = label if subpats[0]: pat += ' <1 ({})'.format(subpats[0]) pat += ' <2 ("{}")'.format(node[1].val) if subpats[2]: pat += ' <3 ({})'.format(subpats[2]) elif label == 'clause': pat = label for i, subpat in enumerate(subpats): if subpat: pat += ' {} ({})'.format('<1' if i == 0 else '<<', subpats[i]) elif label == 'compound': if len(subpats) > 1 and subpats[1]: pat = label for i, subpat in enumerate(subpats): pat += ' <{} ({})'.format(i+1, subpat) else: return None elif label == 'head': pat = label pat += ' <1 ({})'.format(subpats[0]) if not pat: for s in subpats: if s: pat = s break return pat # construct a pattern connecting (variable) node [a] to [b] def connect(a, b): path_a = [] node = a while node.parent(): node = node.parent() path_a.insert(0, node) path_b = [] node = b while node.parent(): node = node.parent() path_b.insert(0, node) common_ancestor = None for i in range(min(len(path_a), len(path_b))): if path_a[i] is not path_b[i]: break if path_a[i].label() in {'compound', 'head'}: break common_ancestor = path_a[i] def node_label(node): if node.label() == 'compound': right = node[1] nargs = 1 while right._label == "args" and len(right) == 2: right = right[1] nargs += 1 return 'compound <1 (functor <: "{}{}")'.format(node[0][0].val, nargs) if node.label() == 'binop': return 'binop <2 "{}"'.format(node[1].val) return node.label() i = 0 while path_a[i].label() != 'clause': i += 1 # path from top to common ancestor pat = path_a[i].label() i += 1 n_top = 0 while i < min(len(path_a), len(path_b)) and path_a[i] is path_b[i]: node = path_a[i] i += 1 if node.label() == 'and': continue if node.parent().label() == 'and': op = '<+(and)' else: op = '<{}'.format(node.parent_index()+1) pat += ' {} ({}'.format(op, node_label(node)) n_top += 1 path_a = path_a[i:] path_b = path_b[i:] # path from common ancestor to [a] n_a = 0 for node in path_a: if node.label() == 'and': continue op = '<' if node.parent().label() == 'and': op = '<+(and)' elif node.parent_index() is not None: op = '<{}'.format(node.parent_index()+1) pat += ' {} ({}'.format(op, node_label(node)) n_a += 1 pat += ' <{} ({} <: "{}")'.format(a.parent_index()+1, a.label(), a[0].val) pat += ')' * n_a # path from common ancestor to [b] n_b = 0 for node in path_b: if node.label() == 'and': continue op = '<' if node.parent().label() == 'and': op = '<+(and)' elif node.parent_index() is not None: op = '<{}'.format(node.parent_index()+1) pat += ' {} ({}'.format(op, node_label(node)) n_b += 1 pat += ' <{} ({} <: "{}")'.format(b.parent_index()+1, b.label(), b[0].val) pat += ')' * (n_top + n_b) return pat # replace variable names with patterns and backlinks def postprocess(pattern): macros = '@ VAR /[A-Z]/; ' #m = re.search(r'"[A-Z][^"]*_[0-9]+[^"]*"', pattern) m = re.search(r'"[A-Z]_[0-9]+"', pattern) nvars = 0 while m is not None: orig_name = m.group(0) n = orig_name.strip('"').split("_")[1] pat_name = 'v{}'.format(nvars) nvars += 1 pattern = pattern[:m.start()] + '@VAR={}{}'.format(pat_name, n) + pattern[m.end():] for m in re.finditer(orig_name, pattern): pattern = pattern[:m.start()] + '~{}{}'.format(pat_name, n) + pattern[m.end():] m = re.search(r'"([A-Z]*_[0-9]+)"', pattern) return macros + pattern def postprocess_simple(pattern): pattern = postprocess(pattern) if pattern.startswith("@ VAR /[A-Z]/; clause <2 (or"): return None #relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|\(variable.*?\))', pattern) relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', pattern) # elements elements = [] current = "" i = 0 par = 0 while i < len(pattern): if par > 0 and pattern[i] == ")": par -= 1 if par == 0: elements.append(current) current = "" if par > 0 and pattern[i] == "(": par += 1 if par == 0 and \ (pattern[i:].startswith("(head") or pattern[i:].startswith("(compound") or pattern[i:].startswith("(binop")): par = 1 if par > 0: current += pattern[i] i += 1 # simplify variable for ei, e in enumerate(elements): # #elements[ei] = re.sub("\(variable.*?\)", "(variable)", e) elements[ei] = "("+" ".join(re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', e))+")" elements = sorted(elements) #print(pattern) #print(relevant) #return "_".join(relevant)#pattern return " ".join(elements)#pattern # construct pattern to match the structure of nodes given by [include], # supports variables and literals def pattern2(node, include): if isinstance(node, Token): return None label = node.label() if any(n is node for n in include): if label == 'literal': return '"{}"'.format(node[0].val) if label == 'variable': return '{}'.format(label) return None if label == 'functor': return '({} "{}")'.format(label, node[0].val) subpats = [pattern2(child, include) for child in node] pat = None if any(subpats): if label == 'and': if subpats[1]: pat = subpats[1] if subpats[0]: if pat: pat = subpats[0] + ' ' + pat else: pat = subpats[0] elif label == 'args': pat = label for i, subpat in enumerate(subpats): if subpat: pat += ' {}'.format(subpat) pat = '(' + pat + ')' elif label == 'unop': pat = '(' + label + ' ' + node[0].val + ' ' + subpats[1] + ')' elif label == 'binop': pat = label if subpats[0]: pat += ' {}'.format(subpats[0]) pat += ' "{}"'.format(node[1].val) if subpats[2]: pat += ' {}'.format(subpats[2]) pat = '(' + pat + ')' elif label == 'clause': pat = label for i, subpat in enumerate(subpats): if subpat: pat += ' {}'.format(subpats[i]) return '(' + pat + ')' elif label == 'compound': if len(subpats) > 1 and subpats[1]: pat = label for i, subpat in enumerate(subpats): pat += ' {}'.format(subpat) pat = '(' + pat + ')' else: return None elif label == 'head': pat = label pat += ' {}'.format(subpats[0]) pat = '(' + pat + ')' elif label == 'list': pat = 'list ' if subpats[0]: pat += '(h {})'.format(subpats[0]) if subpats[0] and subpats[1]: pat += ' ' if subpats[1]: pat += '(t {})'.format(subpats[1]) pat = '(' + pat + ')' if not pat: for s in subpats: if s: pat = s break return pat def get_patterns(tree): orig = tree if isinstance(tree, str): tree = prolog_parse(tree) if tree is None: return tree = ParentedTree.convert(tree) # get patterns separately for each clause for clause in tree: all_variables = [] variables = collections.defaultdict(list) def walk(node): if isinstance(node, Tree): if node.label() == 'variable': name = node[0].val variables[name].append(node) all_variables.append(node) else: for child in node: walk(child) walk(clause) # connecting pairs of nodes with same variable for var, nodes in variables.items(): for selected in combinations(nodes, 2): pat = pattern2(clause, selected) if pat: print(pat) yield pat, selected #pat = connect(*selected) #if pat: # pp = postprocess_simple(pat) # if pp: # yield pp, selected # add singletons for var, nodes in variables.items(): if len(nodes) == 1: pat = pattern2(clause, nodes) if pat: print(pat) yield pat, nodes #pat = pattern(tree, var) #if pat: # pp = postprocess_simple(pat) # if pp: # yield pp, nodes # add patterns for literal/variable pairs literals = [node for node in clause.subtrees() if node.label() == 'literal'] for literal in literals: top = literal while top != clause and top.parent().label() in {'compound', 'binop', 'unop', 'args', 'list'}: top = top.parent() variables = [node for node in top.subtrees() if node.label() == 'variable'] for var in variables: pat = pattern2(clause, [var, literal]) if pat: yield pat, [var, literal] # # connecting pairs of nodes with variables # for selected in combinations(all_variables, 2): # pat = connect(selected[0], selected[1]) # if pat: # yield postprocess(pat), selected # # using pairs of nodes with variables # for selected in combinations(all_variables, 2): # pat = pattern(tree, selected) # if pat: # yield postprocess(pat), selected # # using pairs of nodes with same variable # for var, nodes in variables.items(): # for selected in combinations(nodes, 2): # pat = pattern(tree, selected) # if pat: # yield postprocess(pat), selected # # using each variable separately # for var, nodes in variables.items(): # pat = pattern(tree, nodes) # if pat: # yield postprocess(pat), nodes # # using each goal to select variables FIXME # goals = [s for s in tree.subtrees() if s.label() in {'compound', 'binop'}] # for goal in goals: # goal_vars = {n.val for n in goal.leaves() if n.type == 'VARIABLE'} # pat = pattern(tree, goal_vars) # if pat: # yield postprocess(pat) # nltk.tgrep does not play nice with non-Tree leaves def _tgrep_prepare(tree): if not isinstance(tree, ParentedTree): tree = ParentedTree.convert(tree) def prepare(node): if isinstance(node, Token) or isinstance(node, str): return ParentedTree(str(node), []) return ParentedTree(node.label(), [prepare(child) for child in node]) return prepare(tree) def find_motif(tree, motif): tree = _tgrep_prepare(tree) return tgrep_positions(motif, [tree]) # Extract edits and other data from existing traces for each problem. # Run with: python3 -m monkey.patterns if __name__ == '__main__': # Ignore traces from these users. ignored_users = [ 1, # admin 231, # test 360, # test2 358, # sasha ] pid = int(sys.argv[1]) name = sys.argv[2] submissions = pickle.load(open('pickle/programs-{}.pickle'.format(pid), 'rb')) print('Analyzing programs for problem {}…'.format(pid)) ndata = { 'train': [], 'test': [] } # get all users users = set() for code, data in submissions.items(): users |= data['users'] users = list(users) users.sort() random.Random(0).shuffle(users) split = int(len(users)*0.7) learn_users = set(users[:split]) test_users = set(users[split:]) print(test_users) for code, data in submissions.items(): if len(code) > 1000 or prolog_parse(code) is None: continue if name not in code: continue ndata['train'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & learn_users) ndata['test'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & test_users) #programs += [(code, data['n_tests'] == data['n_passed'])] * len(data['users']) print('correct: {} ({} unique)'.format( len([code for code, correct in ndata['train'] if correct]), len({code for code, correct in ndata['train'] if correct}))) print('incorrect: {} ({} unique)'.format( len([code for code, correct in ndata['train'] if not correct]), len({code for code, correct in ndata['train'] if not correct}))) #iprograms.sort() #random.Random(0).shuffle(programs) #split = int(len(programs)*0.7) #data = { # 'train': programs[:split], # 'test': programs[split:], #} data = ndata # extract attributes from training data patterns = collections.Counter() for code, correct in data['train']: for pat, nodes in get_patterns(code): patterns[pat] += 1 attrs = [] with open('data/attributes-{:03}.tab'.format(pid), 'w') as pattern_file: for i, (pat, count) in enumerate(patterns.most_common()): if count < 1: break attrs.append(pat) print('a{}\t{}'.format(i, pat), file=pattern_file) # check and write attributes for training/test data for t in ['train', 'test']: with open('data/programs-{:03}-{}.tab'.format(pid, t), 'w') as f: # print header print('\t'.join(['code', 'correct'] + ['a'+str(i) for i in range(len(attrs))]), file=f) print('\t'.join(['d'] * (len(attrs)+2)), file=f) print('meta\tclass', file=f) for code, correct in data[t]: record = '{}\t{}'.format(repr(code), 'T' if correct else 'F') ## check attributes by using tgrep to find patterns #tree = _tgrep_prepare(prolog_parse(code)) #for pat in attrs: # matches = list(tgrep_positions(pat, [tree])) # record += '\t{}'.format(bool(matches[0])) # check attributes by enumerating patterns in code code_pats = [pat for pat, nodes in get_patterns(code)] for pat in attrs: record += '\t{}'.format('T' if pat in code_pats else 'F') print(record, file=f)