diff options
-rw-r--r-- | monkey/patterns.py | 365 |
1 files changed, 39 insertions, 326 deletions
diff --git a/monkey/patterns.py b/monkey/patterns.py index 48f9163..90709b9 100644 --- a/monkey/patterns.py +++ b/monkey/patterns.py @@ -1,5 +1,5 @@ # CodeQ: an online programming tutor. -# Copyright (C) 2015 UL FRI +# Copyright (C) 2016,2017 UL FRI # # This program is free software: you can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the Free @@ -18,223 +18,16 @@ import collections from itertools import combinations import pickle import random -import re import sys from nltk import ParentedTree, Tree -from nltk.tgrep import tgrep_positions -from prolog.util import Token, parse as prolog_parse - -# construct pattern to match the structure of nodes given by [include] -def pattern(node, include): - if isinstance(node, Token): - if node.type == 'NAME': - return '"{}"'.format(node.val) - return None - - if any(n is node for n in include): - if node.label() == 'variable': - return 'variable <: "{}"'.format(node[0].val) - return None - - label = node.label() - subpats = [pattern(child, include) for child in node] - - if label == 'functor': - return '{} <1 ({})'.format(label, subpats[0]) - - pat = None - if any(subpats): - if label == 'and': - if subpats[1]: - pat = subpats[1] - if subpats[0]: - if pat: - pat = subpats[0] + ' .. (' + pat + ')' - else: - pat = subpats[0] - elif label == 'args': - pat = label - for i, subpat in enumerate(subpats): - if subpat: - pat += ' <{} ({})'.format(i+1, subpat) - elif label == 'binop': - pat = label - if subpats[0]: - pat += ' <1 ({})'.format(subpats[0]) - pat += ' <2 ("{}")'.format(node[1].val) - if subpats[2]: - pat += ' <3 ({})'.format(subpats[2]) - elif label == 'clause': - pat = label - for i, subpat in enumerate(subpats): - if subpat: - pat += ' {} ({})'.format('<1' if i == 0 else '<<', subpats[i]) - elif label == 'compound': - if len(subpats) > 1 and subpats[1]: - pat = label - for i, subpat in enumerate(subpats): - pat += ' <{} ({})'.format(i+1, subpat) - else: - return None - elif label == 'head': - pat = label - pat += ' <1 ({})'.format(subpats[0]) - if not pat: - for s in subpats: - if s: - pat = s - break - return pat - -# construct a pattern connecting (variable) node [a] to [b] -def connect(a, b): - path_a = [] - node = a - while node.parent(): - node = node.parent() - path_a.insert(0, node) - - path_b = [] - node = b - while node.parent(): - node = node.parent() - path_b.insert(0, node) - - common_ancestor = None - for i in range(min(len(path_a), len(path_b))): - if path_a[i] is not path_b[i]: - break - if path_a[i].label() in {'compound', 'head'}: - break - common_ancestor = path_a[i] - - def node_label(node): - if node.label() == 'compound': - right = node[1] - nargs = 1 - while right._label == "args" and len(right) == 2: - right = right[1] - nargs += 1 - return 'compound <1 (functor <: "{}{}")'.format(node[0][0].val, nargs) - if node.label() == 'binop': - return 'binop <2 "{}"'.format(node[1].val) - return node.label() - i = 0 - while path_a[i].label() != 'clause': - i += 1 - - # path from top to common ancestor - pat = path_a[i].label() - i += 1 - n_top = 0 - while i < min(len(path_a), len(path_b)) and path_a[i] is path_b[i]: - node = path_a[i] - i += 1 - if node.label() == 'and': - continue - if node.parent().label() == 'and': - op = '<+(and)' - else: - op = '<{}'.format(node.parent_index()+1) - pat += ' {} ({}'.format(op, node_label(node)) - n_top += 1 - - path_a = path_a[i:] - path_b = path_b[i:] - - # path from common ancestor to [a] - n_a = 0 - for node in path_a: - if node.label() == 'and': - continue - op = '<' - if node.parent().label() == 'and': - op = '<+(and)' - elif node.parent_index() is not None: - op = '<{}'.format(node.parent_index()+1) - pat += ' {} ({}'.format(op, node_label(node)) - n_a += 1 - pat += ' <{} ({} <: "{}")'.format(a.parent_index()+1, a.label(), a[0].val) - pat += ')' * n_a - - # path from common ancestor to [b] - n_b = 0 - for node in path_b: - if node.label() == 'and': - continue - op = '<' - if node.parent().label() == 'and': - op = '<+(and)' - elif node.parent_index() is not None: - op = '<{}'.format(node.parent_index()+1) - pat += ' {} ({}'.format(op, node_label(node)) - n_b += 1 - pat += ' <{} ({} <: "{}")'.format(b.parent_index()+1, b.label(), b[0].val) - - pat += ')' * (n_top + n_b) - return pat - -# replace variable names with patterns and backlinks -def postprocess(pattern): - macros = '@ VAR /[A-Z]/; ' - #m = re.search(r'"[A-Z][^"]*_[0-9]+[^"]*"', pattern) - m = re.search(r'"[A-Z]_[0-9]+"', pattern) - nvars = 0 - while m is not None: - orig_name = m.group(0) - n = orig_name.strip('"').split("_")[1] - pat_name = 'v{}'.format(nvars) - nvars += 1 - pattern = pattern[:m.start()] + '@VAR={}{}'.format(pat_name, n) + pattern[m.end():] - for m in re.finditer(orig_name, pattern): - pattern = pattern[:m.start()] + '~{}{}'.format(pat_name, n) + pattern[m.end():] - m = re.search(r'"([A-Z]*_[0-9]+)"', pattern) - return macros + pattern - -def postprocess_simple(pattern): - pattern = postprocess(pattern) - if pattern.startswith("@ VAR /[A-Z]/; clause <2 (or"): - return None - #relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|\(variable.*?\))', pattern) - relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', pattern) - - # elements - elements = [] - current = "" - i = 0 - par = 0 - while i < len(pattern): - if par > 0 and pattern[i] == ")": - par -= 1 - if par == 0: - elements.append(current) - current = "" - if par > 0 and pattern[i] == "(": - par += 1 - if par == 0 and \ - (pattern[i:].startswith("(head") or - pattern[i:].startswith("(compound") or - pattern[i:].startswith("(binop")): - par = 1 - if par > 0: - current += pattern[i] - i += 1 - # simplify variable - for ei, e in enumerate(elements): - # #elements[ei] = re.sub("\(variable.*?\)", "(variable)", e) - elements[ei] = "("+" ".join(re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', e))+")" - elements = sorted(elements) - #print(pattern) - #print(relevant) - #return "_".join(relevant)#pattern - return " ".join(elements)#pattern +from prolog.util import parse as prolog_parse # construct pattern to match the structure of nodes given by [include], # supports variables and literals -def pattern2(node, include): - if isinstance(node, Token): +def pattern(node, include): + if not isinstance(node, Tree): return None label = node.label() @@ -247,7 +40,7 @@ def pattern2(node, include): if label == 'functor': return '({} "{}")'.format(label, node[0].val) - subpats = [pattern2(child, include) for child in node] + subpats = [pattern(child, include) for child in node] pat = None if any(subpats): if label == 'and': @@ -309,7 +102,6 @@ def pattern2(node, include): return pat def get_patterns(tree): - orig = tree if isinstance(tree, str): tree = prolog_parse(tree) if tree is None: @@ -318,14 +110,12 @@ def get_patterns(tree): # get patterns separately for each clause for clause in tree: - all_variables = [] variables = collections.defaultdict(list) def walk(node): if isinstance(node, Tree): if node.label() == 'variable': name = node[0].val variables[name].append(node) - all_variables.append(node) else: for child in node: walk(child) @@ -334,26 +124,16 @@ def get_patterns(tree): # connecting pairs of nodes with same variable for var, nodes in variables.items(): for selected in combinations(nodes, 2): - pat = pattern2(clause, selected) + pat = pattern(clause, selected) if pat: yield pat, selected - #pat = connect(*selected) - #if pat: - # pp = postprocess_simple(pat) - # if pp: - # yield pp, selected # add singletons for var, nodes in variables.items(): if len(nodes) == 1: - pat = pattern2(clause, nodes) + pat = pattern(clause, nodes) if pat: yield pat, nodes - #pat = pattern(tree, var) - #if pat: - # pp = postprocess_simple(pat) - # if pp: - # yield pp, nodes # add patterns for literal/variable pairs literals = [node for node in clause.subtrees() if node.label() == 'literal'] @@ -363,116 +143,56 @@ def get_patterns(tree): top = top.parent() variables = [node for node in top.subtrees() if node.label() == 'variable'] for var in variables: - pat = pattern2(clause, [var, literal]) + pat = pattern(clause, [var, literal]) if pat: yield pat, [var, literal] -# # connecting pairs of nodes with variables -# for selected in combinations(all_variables, 2): -# pat = connect(selected[0], selected[1]) -# if pat: -# yield postprocess(pat), selected - -# # using pairs of nodes with variables -# for selected in combinations(all_variables, 2): -# pat = pattern(tree, selected) -# if pat: -# yield postprocess(pat), selected - -# # using pairs of nodes with same variable -# for var, nodes in variables.items(): -# for selected in combinations(nodes, 2): -# pat = pattern(tree, selected) -# if pat: -# yield postprocess(pat), selected - -# # using each variable separately -# for var, nodes in variables.items(): -# pat = pattern(tree, nodes) -# if pat: -# yield postprocess(pat), nodes - -# # using each goal to select variables FIXME -# goals = [s for s in tree.subtrees() if s.label() in {'compound', 'binop'}] -# for goal in goals: -# goal_vars = {n.val for n in goal.leaves() if n.type == 'VARIABLE'} -# pat = pattern(tree, goal_vars) -# if pat: -# yield postprocess(pat) - -# nltk.tgrep does not play nice with non-Tree leaves -def _tgrep_prepare(tree): - if not isinstance(tree, ParentedTree): - tree = ParentedTree.convert(tree) - def prepare(node): - if isinstance(node, Token) or isinstance(node, str): - return ParentedTree(str(node), []) - return ParentedTree(node.label(), [prepare(child) for child in node]) - return prepare(tree) - -def find_motif(tree, motif): - tree = _tgrep_prepare(tree) - return tgrep_positions(motif, [tree]) - # Extract edits and other data from existing traces for each problem. # Run with: python3 -m monkey.patterns <problem ID> <solutions.pickle> if __name__ == '__main__': - # Ignore traces from these users. - ignored_users = [ - 1, # admin - 231, # test - 360, # test2 - 358, # sasha - ] - pid = int(sys.argv[1]) name = sys.argv[2] submissions = pickle.load(open('pickle/programs-{}.pickle'.format(pid), 'rb')) - print('Analyzing programs for problem {}…'.format(pid)) - ndata = { - 'train': [], - 'test': [] - } - # get all users - users = set() - for code, data in submissions.items(): - users |= data['users'] - users = list(users) - users.sort() + # find test/train users + users = sorted({user for code, info in submissions.items() for user in info['users']}) random.Random(0).shuffle(users) split = int(len(users)*0.7) learn_users = set(users[:split]) test_users = set(users[split:]) - fusers = open('data/users-test-{}.txt'.format(pid), 'wt') - for tu in test_users: - fusers.write('{}\n'.format(tu)) - for code, data in submissions.items(): + # save test users to file + with open('data/users-test-{}.txt'.format(pid), 'wt') as f: + for user in test_users: + print(user, file=f) + + # find test/train programs + data = { + 'train': [], + 'test': [] + } + for code, info in submissions.items(): if len(code) > 1000 or prolog_parse(code) is None: continue if name not in code: continue - ndata['train'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & learn_users) - ndata['test'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & test_users) - #programs += [(code, data['n_tests'] == data['n_passed'])] * len(data['users']) + data['train'] += [(code, info['n_tests'] == info['n_passed'])] * len(info['users'] & learn_users) + data['test'] += [(code, info['n_tests'] == info['n_passed'])] * len(info['users'] & test_users) + + # print info about test users and test/train programs + print('Test users:') + print(test_users) + print() + for which in ['train', 'test']: + print('Programs ({}):'.format(which)) + print('correct: {} ({} unique)'.format( + len([code for code, correct in data[which] if correct]), + len({code for code, correct in data[which] if correct}))) + print('incorrect: {} ({} unique)'.format( + len([code for code, correct in data[which] if not correct]), + len({code for code, correct in data[which] if not correct}))) + print() - print('correct: {} ({} unique)'.format( - len([code for code, correct in ndata['train'] if correct]), - len({code for code, correct in ndata['train'] if correct}))) - print('incorrect: {} ({} unique)'.format( - len([code for code, correct in ndata['train'] if not correct]), - len({code for code, correct in ndata['train'] if not correct}))) - - #iprograms.sort() - #random.Random(0).shuffle(programs) - #split = int(len(programs)*0.7) - #data = { - # 'train': programs[:split], - # 'test': programs[split:], - #} - data = ndata - # extract attributes from training data patterns = collections.Counter() for code, correct in data['train']: @@ -494,18 +214,11 @@ if __name__ == '__main__': print('\t'.join(['code', 'correct'] + ['a'+str(i) for i in range(len(attrs))]), file=f) print('\t'.join(['d'] * (len(attrs)+2)), file=f) print('meta\tclass', file=f) + + # print rows (program, correct, attr1, attr2, …) for code, correct in data[t]: record = '{}\t{}'.format(repr(code), 'T' if correct else 'F') - - ## check attributes by using tgrep to find patterns - #tree = _tgrep_prepare(prolog_parse(code)) - #for pat in attrs: - # matches = list(tgrep_positions(pat, [tree])) - # record += '\t{}'.format(bool(matches[0])) - - # check attributes by enumerating patterns in code code_pats = [pat for pat, nodes in get_patterns(code)] for pat in attrs: record += '\t{}'.format('T' if pat in code_pats else 'F') - print(record, file=f) |