summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--monkey/patterns.py365
1 files changed, 39 insertions, 326 deletions
diff --git a/monkey/patterns.py b/monkey/patterns.py
index 48f9163..90709b9 100644
--- a/monkey/patterns.py
+++ b/monkey/patterns.py
@@ -1,5 +1,5 @@
# CodeQ: an online programming tutor.
-# Copyright (C) 2015 UL FRI
+# Copyright (C) 2016,2017 UL FRI
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
@@ -18,223 +18,16 @@ import collections
from itertools import combinations
import pickle
import random
-import re
import sys
from nltk import ParentedTree, Tree
-from nltk.tgrep import tgrep_positions
-from prolog.util import Token, parse as prolog_parse
-
-# construct pattern to match the structure of nodes given by [include]
-def pattern(node, include):
- if isinstance(node, Token):
- if node.type == 'NAME':
- return '"{}"'.format(node.val)
- return None
-
- if any(n is node for n in include):
- if node.label() == 'variable':
- return 'variable <: "{}"'.format(node[0].val)
- return None
-
- label = node.label()
- subpats = [pattern(child, include) for child in node]
-
- if label == 'functor':
- return '{} <1 ({})'.format(label, subpats[0])
-
- pat = None
- if any(subpats):
- if label == 'and':
- if subpats[1]:
- pat = subpats[1]
- if subpats[0]:
- if pat:
- pat = subpats[0] + ' .. (' + pat + ')'
- else:
- pat = subpats[0]
- elif label == 'args':
- pat = label
- for i, subpat in enumerate(subpats):
- if subpat:
- pat += ' <{} ({})'.format(i+1, subpat)
- elif label == 'binop':
- pat = label
- if subpats[0]:
- pat += ' <1 ({})'.format(subpats[0])
- pat += ' <2 ("{}")'.format(node[1].val)
- if subpats[2]:
- pat += ' <3 ({})'.format(subpats[2])
- elif label == 'clause':
- pat = label
- for i, subpat in enumerate(subpats):
- if subpat:
- pat += ' {} ({})'.format('<1' if i == 0 else '<<', subpats[i])
- elif label == 'compound':
- if len(subpats) > 1 and subpats[1]:
- pat = label
- for i, subpat in enumerate(subpats):
- pat += ' <{} ({})'.format(i+1, subpat)
- else:
- return None
- elif label == 'head':
- pat = label
- pat += ' <1 ({})'.format(subpats[0])
- if not pat:
- for s in subpats:
- if s:
- pat = s
- break
- return pat
-
-# construct a pattern connecting (variable) node [a] to [b]
-def connect(a, b):
- path_a = []
- node = a
- while node.parent():
- node = node.parent()
- path_a.insert(0, node)
-
- path_b = []
- node = b
- while node.parent():
- node = node.parent()
- path_b.insert(0, node)
-
- common_ancestor = None
- for i in range(min(len(path_a), len(path_b))):
- if path_a[i] is not path_b[i]:
- break
- if path_a[i].label() in {'compound', 'head'}:
- break
- common_ancestor = path_a[i]
-
- def node_label(node):
- if node.label() == 'compound':
- right = node[1]
- nargs = 1
- while right._label == "args" and len(right) == 2:
- right = right[1]
- nargs += 1
- return 'compound <1 (functor <: "{}{}")'.format(node[0][0].val, nargs)
- if node.label() == 'binop':
- return 'binop <2 "{}"'.format(node[1].val)
- return node.label()
- i = 0
- while path_a[i].label() != 'clause':
- i += 1
-
- # path from top to common ancestor
- pat = path_a[i].label()
- i += 1
- n_top = 0
- while i < min(len(path_a), len(path_b)) and path_a[i] is path_b[i]:
- node = path_a[i]
- i += 1
- if node.label() == 'and':
- continue
- if node.parent().label() == 'and':
- op = '<+(and)'
- else:
- op = '<{}'.format(node.parent_index()+1)
- pat += ' {} ({}'.format(op, node_label(node))
- n_top += 1
-
- path_a = path_a[i:]
- path_b = path_b[i:]
-
- # path from common ancestor to [a]
- n_a = 0
- for node in path_a:
- if node.label() == 'and':
- continue
- op = '<'
- if node.parent().label() == 'and':
- op = '<+(and)'
- elif node.parent_index() is not None:
- op = '<{}'.format(node.parent_index()+1)
- pat += ' {} ({}'.format(op, node_label(node))
- n_a += 1
- pat += ' <{} ({} <: "{}")'.format(a.parent_index()+1, a.label(), a[0].val)
- pat += ')' * n_a
-
- # path from common ancestor to [b]
- n_b = 0
- for node in path_b:
- if node.label() == 'and':
- continue
- op = '<'
- if node.parent().label() == 'and':
- op = '<+(and)'
- elif node.parent_index() is not None:
- op = '<{}'.format(node.parent_index()+1)
- pat += ' {} ({}'.format(op, node_label(node))
- n_b += 1
- pat += ' <{} ({} <: "{}")'.format(b.parent_index()+1, b.label(), b[0].val)
-
- pat += ')' * (n_top + n_b)
- return pat
-
-# replace variable names with patterns and backlinks
-def postprocess(pattern):
- macros = '@ VAR /[A-Z]/; '
- #m = re.search(r'"[A-Z][^"]*_[0-9]+[^"]*"', pattern)
- m = re.search(r'"[A-Z]_[0-9]+"', pattern)
- nvars = 0
- while m is not None:
- orig_name = m.group(0)
- n = orig_name.strip('"').split("_")[1]
- pat_name = 'v{}'.format(nvars)
- nvars += 1
- pattern = pattern[:m.start()] + '@VAR={}{}'.format(pat_name, n) + pattern[m.end():]
- for m in re.finditer(orig_name, pattern):
- pattern = pattern[:m.start()] + '~{}{}'.format(pat_name, n) + pattern[m.end():]
- m = re.search(r'"([A-Z]*_[0-9]+)"', pattern)
- return macros + pattern
-
-def postprocess_simple(pattern):
- pattern = postprocess(pattern)
- if pattern.startswith("@ VAR /[A-Z]/; clause <2 (or"):
- return None
- #relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|\(variable.*?\))', pattern)
- relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', pattern)
-
- # elements
- elements = []
- current = ""
- i = 0
- par = 0
- while i < len(pattern):
- if par > 0 and pattern[i] == ")":
- par -= 1
- if par == 0:
- elements.append(current)
- current = ""
- if par > 0 and pattern[i] == "(":
- par += 1
- if par == 0 and \
- (pattern[i:].startswith("(head") or
- pattern[i:].startswith("(compound") or
- pattern[i:].startswith("(binop")):
- par = 1
- if par > 0:
- current += pattern[i]
- i += 1
- # simplify variable
- for ei, e in enumerate(elements):
- # #elements[ei] = re.sub("\(variable.*?\)", "(variable)", e)
- elements[ei] = "("+" ".join(re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', e))+")"
- elements = sorted(elements)
- #print(pattern)
- #print(relevant)
- #return "_".join(relevant)#pattern
- return " ".join(elements)#pattern
+from prolog.util import parse as prolog_parse
# construct pattern to match the structure of nodes given by [include],
# supports variables and literals
-def pattern2(node, include):
- if isinstance(node, Token):
+def pattern(node, include):
+ if not isinstance(node, Tree):
return None
label = node.label()
@@ -247,7 +40,7 @@ def pattern2(node, include):
if label == 'functor':
return '({} "{}")'.format(label, node[0].val)
- subpats = [pattern2(child, include) for child in node]
+ subpats = [pattern(child, include) for child in node]
pat = None
if any(subpats):
if label == 'and':
@@ -309,7 +102,6 @@ def pattern2(node, include):
return pat
def get_patterns(tree):
- orig = tree
if isinstance(tree, str):
tree = prolog_parse(tree)
if tree is None:
@@ -318,14 +110,12 @@ def get_patterns(tree):
# get patterns separately for each clause
for clause in tree:
- all_variables = []
variables = collections.defaultdict(list)
def walk(node):
if isinstance(node, Tree):
if node.label() == 'variable':
name = node[0].val
variables[name].append(node)
- all_variables.append(node)
else:
for child in node:
walk(child)
@@ -334,26 +124,16 @@ def get_patterns(tree):
# connecting pairs of nodes with same variable
for var, nodes in variables.items():
for selected in combinations(nodes, 2):
- pat = pattern2(clause, selected)
+ pat = pattern(clause, selected)
if pat:
yield pat, selected
- #pat = connect(*selected)
- #if pat:
- # pp = postprocess_simple(pat)
- # if pp:
- # yield pp, selected
# add singletons
for var, nodes in variables.items():
if len(nodes) == 1:
- pat = pattern2(clause, nodes)
+ pat = pattern(clause, nodes)
if pat:
yield pat, nodes
- #pat = pattern(tree, var)
- #if pat:
- # pp = postprocess_simple(pat)
- # if pp:
- # yield pp, nodes
# add patterns for literal/variable pairs
literals = [node for node in clause.subtrees() if node.label() == 'literal']
@@ -363,116 +143,56 @@ def get_patterns(tree):
top = top.parent()
variables = [node for node in top.subtrees() if node.label() == 'variable']
for var in variables:
- pat = pattern2(clause, [var, literal])
+ pat = pattern(clause, [var, literal])
if pat:
yield pat, [var, literal]
-# # connecting pairs of nodes with variables
-# for selected in combinations(all_variables, 2):
-# pat = connect(selected[0], selected[1])
-# if pat:
-# yield postprocess(pat), selected
-
-# # using pairs of nodes with variables
-# for selected in combinations(all_variables, 2):
-# pat = pattern(tree, selected)
-# if pat:
-# yield postprocess(pat), selected
-
-# # using pairs of nodes with same variable
-# for var, nodes in variables.items():
-# for selected in combinations(nodes, 2):
-# pat = pattern(tree, selected)
-# if pat:
-# yield postprocess(pat), selected
-
-# # using each variable separately
-# for var, nodes in variables.items():
-# pat = pattern(tree, nodes)
-# if pat:
-# yield postprocess(pat), nodes
-
-# # using each goal to select variables FIXME
-# goals = [s for s in tree.subtrees() if s.label() in {'compound', 'binop'}]
-# for goal in goals:
-# goal_vars = {n.val for n in goal.leaves() if n.type == 'VARIABLE'}
-# pat = pattern(tree, goal_vars)
-# if pat:
-# yield postprocess(pat)
-
-# nltk.tgrep does not play nice with non-Tree leaves
-def _tgrep_prepare(tree):
- if not isinstance(tree, ParentedTree):
- tree = ParentedTree.convert(tree)
- def prepare(node):
- if isinstance(node, Token) or isinstance(node, str):
- return ParentedTree(str(node), [])
- return ParentedTree(node.label(), [prepare(child) for child in node])
- return prepare(tree)
-
-def find_motif(tree, motif):
- tree = _tgrep_prepare(tree)
- return tgrep_positions(motif, [tree])
-
# Extract edits and other data from existing traces for each problem.
# Run with: python3 -m monkey.patterns <problem ID> <solutions.pickle>
if __name__ == '__main__':
- # Ignore traces from these users.
- ignored_users = [
- 1, # admin
- 231, # test
- 360, # test2
- 358, # sasha
- ]
-
pid = int(sys.argv[1])
name = sys.argv[2]
submissions = pickle.load(open('pickle/programs-{}.pickle'.format(pid), 'rb'))
- print('Analyzing programs for problem {}…'.format(pid))
- ndata = {
- 'train': [],
- 'test': []
- }
- # get all users
- users = set()
- for code, data in submissions.items():
- users |= data['users']
- users = list(users)
- users.sort()
+ # find test/train users
+ users = sorted({user for code, info in submissions.items() for user in info['users']})
random.Random(0).shuffle(users)
split = int(len(users)*0.7)
learn_users = set(users[:split])
test_users = set(users[split:])
- fusers = open('data/users-test-{}.txt'.format(pid), 'wt')
- for tu in test_users:
- fusers.write('{}\n'.format(tu))
- for code, data in submissions.items():
+ # save test users to file
+ with open('data/users-test-{}.txt'.format(pid), 'wt') as f:
+ for user in test_users:
+ print(user, file=f)
+
+ # find test/train programs
+ data = {
+ 'train': [],
+ 'test': []
+ }
+ for code, info in submissions.items():
if len(code) > 1000 or prolog_parse(code) is None:
continue
if name not in code:
continue
- ndata['train'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & learn_users)
- ndata['test'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & test_users)
- #programs += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'])
+ data['train'] += [(code, info['n_tests'] == info['n_passed'])] * len(info['users'] & learn_users)
+ data['test'] += [(code, info['n_tests'] == info['n_passed'])] * len(info['users'] & test_users)
+
+ # print info about test users and test/train programs
+ print('Test users:')
+ print(test_users)
+ print()
+ for which in ['train', 'test']:
+ print('Programs ({}):'.format(which))
+ print('correct: {} ({} unique)'.format(
+ len([code for code, correct in data[which] if correct]),
+ len({code for code, correct in data[which] if correct})))
+ print('incorrect: {} ({} unique)'.format(
+ len([code for code, correct in data[which] if not correct]),
+ len({code for code, correct in data[which] if not correct})))
+ print()
- print('correct: {} ({} unique)'.format(
- len([code for code, correct in ndata['train'] if correct]),
- len({code for code, correct in ndata['train'] if correct})))
- print('incorrect: {} ({} unique)'.format(
- len([code for code, correct in ndata['train'] if not correct]),
- len({code for code, correct in ndata['train'] if not correct})))
-
- #iprograms.sort()
- #random.Random(0).shuffle(programs)
- #split = int(len(programs)*0.7)
- #data = {
- # 'train': programs[:split],
- # 'test': programs[split:],
- #}
- data = ndata
-
# extract attributes from training data
patterns = collections.Counter()
for code, correct in data['train']:
@@ -494,18 +214,11 @@ if __name__ == '__main__':
print('\t'.join(['code', 'correct'] + ['a'+str(i) for i in range(len(attrs))]), file=f)
print('\t'.join(['d'] * (len(attrs)+2)), file=f)
print('meta\tclass', file=f)
+
+ # print rows (program, correct, attr1, attr2, …)
for code, correct in data[t]:
record = '{}\t{}'.format(repr(code), 'T' if correct else 'F')
-
- ## check attributes by using tgrep to find patterns
- #tree = _tgrep_prepare(prolog_parse(code))
- #for pat in attrs:
- # matches = list(tgrep_positions(pat, [tree]))
- # record += '\t{}'.format(bool(matches[0]))
-
- # check attributes by enumerating patterns in code
code_pats = [pat for pat, nodes in get_patterns(code)]
for pat in attrs:
record += '\t{}'.format('T' if pat in code_pats else 'F')
-
print(record, file=f)