1 files changed, 39 insertions, 326 deletions
diff --git a/monkey/patterns.py b/monkey/patterns.py
index 48f9163..90709b9 100644
--- a/monkey/patterns.py
+++ b/monkey/patterns.py
@@ -1,5 +1,5 @@
 # CodeQ: an online programming tutor.
-# Copyright (C) 2015 UL FRI
+# Copyright (C) 2016,2017 UL FRI
 #
 # This program is free software: you can redistribute it and/or modify it under
 # the terms of the GNU Affero General Public License as published by the Free
@@ -18,223 +18,16 @@ import collections
 from itertools import combinations
 import pickle
 import random
-import re
 import sys
 
 from nltk import ParentedTree, Tree
-from nltk.tgrep import tgrep_positions
 
-from prolog.util import Token, parse as prolog_parse
-
-# construct pattern to match the structure of nodes given by [include]
-def pattern(node, include):
-    if isinstance(node, Token):
-        if node.type == 'NAME':
-            return '"{}"'.format(node.val)
-        return None
-
-    if any(n is node for n in include):
-        if node.label() == 'variable':
-            return 'variable <: "{}"'.format(node[0].val)
-        return None
-
-    label = node.label()
-    subpats = [pattern(child, include) for child in node]
-
-    if label == 'functor':
-        return '{} <1 ({})'.format(label, subpats[0])
-
-    pat = None
-    if any(subpats):
-        if label == 'and':
-            if subpats[1]:
-                pat = subpats[1]
-            if subpats[0]:
-                if pat:
-                    pat = subpats[0] + ' .. (' + pat + ')'
-                else:
-                    pat = subpats[0]
-        elif label == 'args':
-            pat = label
-            for i, subpat in enumerate(subpats):
-                if subpat:
-                    pat += ' <{} ({})'.format(i+1, subpat)
-        elif label == 'binop':
-            pat = label
-            if subpats[0]:
-                pat += ' <1 ({})'.format(subpats[0])
-            pat += ' <2 ("{}")'.format(node[1].val)
-            if subpats[2]:
-                pat += ' <3 ({})'.format(subpats[2])
-        elif label == 'clause':
-            pat = label
-            for i, subpat in enumerate(subpats):
-                if subpat:
-                    pat += ' {} ({})'.format('<1' if i == 0 else '<<', subpats[i])
-        elif label == 'compound':
-            if len(subpats) > 1 and subpats[1]:
-                pat = label
-                for i, subpat in enumerate(subpats):
-                    pat += ' <{} ({})'.format(i+1, subpat)
-            else:
-                return None
-        elif label == 'head':
-            pat = label
-            pat += ' <1 ({})'.format(subpats[0])
-        if not pat:
-            for s in subpats:
-                if s:
-                    pat = s
-                    break
-    return pat
-
-# construct a pattern connecting (variable) node [a] to [b]
-def connect(a, b):
-    path_a = []
-    node = a
-    while node.parent():
-        node = node.parent()
-        path_a.insert(0, node)
-
-    path_b = []
-    node = b
-    while node.parent():
-        node = node.parent()
-        path_b.insert(0, node)
-
-    common_ancestor = None
-    for i in range(min(len(path_a), len(path_b))):
-        if path_a[i] is not path_b[i]:
-            break
-        if path_a[i].label() in {'compound', 'head'}:
-            break
-        common_ancestor = path_a[i]
-
-    def node_label(node):
-        if node.label() == 'compound':
-            right = node[1]
-            nargs = 1
-            while right._label == "args" and len(right) == 2:
-                right = right[1]
-                nargs += 1
-            return 'compound <1 (functor <: "{}{}")'.format(node[0][0].val, nargs)
-        if node.label() == 'binop':
-            return 'binop <2 "{}"'.format(node[1].val)
-        return node.label()
-    i = 0
-    while path_a[i].label() != 'clause':
-        i += 1
-
-    # path from top to common ancestor
-    pat = path_a[i].label()
-    i += 1
-    n_top = 0
-    while i < min(len(path_a), len(path_b)) and path_a[i] is path_b[i]:
-        node = path_a[i]
-        i += 1
-        if node.label() == 'and':
-            continue
-        if node.parent().label() == 'and':
-            op = '<+(and)'
-        else:
-            op = '<{}'.format(node.parent_index()+1)
-        pat += ' {} ({}'.format(op, node_label(node))
-        n_top += 1
-
-    path_a = path_a[i:]
-    path_b = path_b[i:]
-
-    # path from common ancestor to [a]
-    n_a = 0
-    for node in path_a:
-        if node.label() == 'and':
-            continue
-        op = '<'
-        if node.parent().label() == 'and':
-            op = '<+(and)'
-        elif node.parent_index() is not None:
-            op = '<{}'.format(node.parent_index()+1)
-        pat += ' {} ({}'.format(op, node_label(node))
-        n_a += 1
-    pat += ' <{} ({} <: "{}")'.format(a.parent_index()+1, a.label(), a[0].val)
-    pat += ')' * n_a
-
-    # path from common ancestor to [b]
-    n_b = 0
-    for node in path_b:
-        if node.label() == 'and':
-            continue
-        op = '<'
-        if node.parent().label() == 'and':
-            op = '<+(and)'
-        elif node.parent_index() is not None:
-            op = '<{}'.format(node.parent_index()+1)
-        pat += ' {} ({}'.format(op, node_label(node))
-        n_b += 1
-    pat += ' <{} ({} <: "{}")'.format(b.parent_index()+1, b.label(), b[0].val)
-
-    pat += ')' * (n_top + n_b)
-    return pat
-
-# replace variable names with patterns and backlinks
-def postprocess(pattern):
-    macros = '@ VAR /[A-Z]/; '
-    #m = re.search(r'"[A-Z][^"]*_[0-9]+[^"]*"', pattern)
-    m = re.search(r'"[A-Z]_[0-9]+"', pattern)
-    nvars = 0
-    while m is not None:
-        orig_name = m.group(0)
-        n = orig_name.strip('"').split("_")[1]
-        pat_name = 'v{}'.format(nvars)
-        nvars += 1
-        pattern = pattern[:m.start()] + '@VAR={}{}'.format(pat_name, n) + pattern[m.end():]
-        for m in re.finditer(orig_name, pattern):
-            pattern = pattern[:m.start()] + '~{}{}'.format(pat_name, n) + pattern[m.end():]
-        m = re.search(r'"([A-Z]*_[0-9]+)"', pattern)
-    return macros + pattern
-
-def postprocess_simple(pattern):
-    pattern = postprocess(pattern)
-    if pattern.startswith("@ VAR /[A-Z]/; clause <2 (or"):
-        return None
-    #relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|\(variable.*?\))', pattern)
-    relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', pattern)
-    
-    # elements
-    elements = []
-    current = ""
-    i = 0
-    par = 0
-    while i < len(pattern):
-        if par > 0 and pattern[i] == ")":
-            par -= 1
-            if par == 0:
-                elements.append(current)
-                current = ""
-        if par > 0 and pattern[i] == "(":
-            par += 1
-        if par == 0 and \
-                (pattern[i:].startswith("(head") or 
-                 pattern[i:].startswith("(compound") or
-                 pattern[i:].startswith("(binop")):
-            par = 1
-        if par > 0:
-            current += pattern[i]
-        i += 1
-    # simplify variable
-    for ei, e in enumerate(elements):
-    #    #elements[ei] = re.sub("\(variable.*?\)", "(variable)", e)
-        elements[ei] = "("+" ".join(re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', e))+")"
-    elements = sorted(elements)
-    #print(pattern)
-    #print(relevant)
-    #return "_".join(relevant)#pattern
-    return " ".join(elements)#pattern
+from prolog.util import parse as prolog_parse
 
 # construct pattern to match the structure of nodes given by [include],
 # supports variables and literals
-def pattern2(node, include):
-    if isinstance(node, Token):
+def pattern(node, include):
+    if not isinstance(node, Tree):
         return None
 
     label = node.label()
@@ -247,7 +40,7 @@ def pattern2(node, include):
     if label == 'functor':
         return '({} "{}")'.format(label, node[0].val)
 
-    subpats = [pattern2(child, include) for child in node]
+    subpats = [pattern(child, include) for child in node]
     pat = None
     if any(subpats):
         if label == 'and':
@@ -309,7 +102,6 @@ def pattern2(node, include):
     return pat
 
 def get_patterns(tree):
-    orig = tree
     if isinstance(tree, str):
         tree = prolog_parse(tree)
         if tree is None:
@@ -318,14 +110,12 @@ def get_patterns(tree):
 
     # get patterns separately for each clause
     for clause in tree:
-        all_variables = []
         variables = collections.defaultdict(list)
         def walk(node):
             if isinstance(node, Tree):
                 if node.label() == 'variable':
                     name = node[0].val
                     variables[name].append(node)
-                    all_variables.append(node)
                 else:
                     for child in node:
                         walk(child)
@@ -334,26 +124,16 @@ def get_patterns(tree):
         # connecting pairs of nodes with same variable
         for var, nodes in variables.items():
             for selected in combinations(nodes, 2):
-                pat = pattern2(clause, selected)
+                pat = pattern(clause, selected)
                 if pat:
                     yield pat, selected
-                #pat = connect(*selected)
-                #if pat:
-                #    pp = postprocess_simple(pat)
-                #    if pp:
-                #        yield pp, selected
 
         # add singletons
         for var, nodes in variables.items():
             if len(nodes) == 1: 
-                pat = pattern2(clause, nodes)
+                pat = pattern(clause, nodes)
                 if pat:
                     yield pat, nodes
-                #pat = pattern(tree, var)
-                #if pat:
-                #    pp = postprocess_simple(pat)
-                #    if pp:
-                #        yield pp, nodes
         
         # add patterns for literal/variable pairs
         literals = [node for node in clause.subtrees() if node.label() == 'literal']
@@ -363,116 +143,56 @@ def get_patterns(tree):
                 top = top.parent()
             variables = [node for node in top.subtrees() if node.label() == 'variable']
             for var in variables:
-                pat = pattern2(clause, [var, literal])
+                pat = pattern(clause, [var, literal])
                 if pat:
                     yield pat, [var, literal]
 
-#        # connecting pairs of nodes with variables
-#        for selected in combinations(all_variables, 2):
-#            pat = connect(selected[0], selected[1])
-#            if pat:
-#                yield postprocess(pat), selected
-
-#        # using pairs of nodes with variables
-#        for selected in combinations(all_variables, 2):
-#            pat = pattern(tree, selected)
-#            if pat:
-#                yield postprocess(pat), selected
-
-#        # using pairs of nodes with same variable
-#        for var, nodes in variables.items():
-#            for selected in combinations(nodes, 2):
-#                pat = pattern(tree, selected)
-#                if pat:
-#                    yield postprocess(pat), selected
-
-#       # using each variable separately
-#       for var, nodes in variables.items():
-#           pat = pattern(tree, nodes)
-#           if pat:
-#               yield postprocess(pat), nodes
-
-#       # using each goal to select variables FIXME
-#       goals = [s for s in tree.subtrees() if s.label() in {'compound', 'binop'}]
-#       for goal in goals:
-#           goal_vars = {n.val for n in goal.leaves() if n.type == 'VARIABLE'}
-#           pat = pattern(tree, goal_vars)
-#           if pat:
-#               yield postprocess(pat)
-
-# nltk.tgrep does not play nice with non-Tree leaves
-def _tgrep_prepare(tree):
-    if not isinstance(tree, ParentedTree):
-        tree = ParentedTree.convert(tree)
-    def prepare(node):
-        if isinstance(node, Token) or isinstance(node, str):
-            return ParentedTree(str(node), [])
-        return ParentedTree(node.label(), [prepare(child) for child in node])
-    return prepare(tree)
-
-def find_motif(tree, motif):
-    tree = _tgrep_prepare(tree)
-    return tgrep_positions(motif, [tree])
-
 # Extract edits and other data from existing traces for each problem.
 # Run with: python3 -m monkey.patterns <problem ID> <solutions.pickle>
 if __name__ == '__main__':
-    # Ignore traces from these users.
-    ignored_users = [
-        1, # admin
-        231, # test
-        360, # test2
-        358, # sasha
-    ]
-
     pid = int(sys.argv[1])
     name = sys.argv[2]
     submissions = pickle.load(open('pickle/programs-{}.pickle'.format(pid), 'rb'))
 
-    print('Analyzing programs for problem {}…'.format(pid))
-    ndata = {
-        'train': [],
-        'test': []
-    }
-    # get all users
-    users = set()
-    for code, data in submissions.items():
-        users |= data['users']
-    users = list(users)
-    users.sort()
+    # find test/train users
+    users = sorted({user for code, info in submissions.items() for user in info['users']})
     random.Random(0).shuffle(users)
     split = int(len(users)*0.7)
     learn_users = set(users[:split])
     test_users = set(users[split:])
-    fusers = open('data/users-test-{}.txt'.format(pid), 'wt')
-    for tu in test_users:
-        fusers.write('{}\n'.format(tu))
 
-    for code, data in submissions.items():
+    # save test users to file
+    with open('data/users-test-{}.txt'.format(pid), 'wt') as f:
+        for user in test_users:
+            print(user, file=f)
+
+    # find test/train programs
+    data = {
+        'train': [],
+        'test': []
+    }
+    for code, info in submissions.items():
         if len(code) > 1000 or prolog_parse(code) is None:
             continue
         if name not in code:
             continue
-        ndata['train'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & learn_users)
-        ndata['test'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & test_users)
-        #programs += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'])
+        data['train'] += [(code, info['n_tests'] == info['n_passed'])] * len(info['users'] & learn_users)
+        data['test'] += [(code, info['n_tests'] == info['n_passed'])] * len(info['users'] & test_users)
+
+    # print info about test users and test/train programs
+    print('Test users:')
+    print(test_users)
+    print()
+    for which in ['train', 'test']:
+        print('Programs ({}):'.format(which))
+        print('correct: {} ({} unique)'.format(
+            len([code for code, correct in data[which] if correct]),
+            len({code for code, correct in data[which] if correct})))
+        print('incorrect: {} ({} unique)'.format(
+            len([code for code, correct in data[which] if not correct]),
+            len({code for code, correct in data[which] if not correct})))
+        print()
 
-    print('correct: {} ({} unique)'.format(
-        len([code for code, correct in ndata['train'] if correct]),
-        len({code for code, correct in ndata['train'] if correct})))
-    print('incorrect: {} ({} unique)'.format(
-        len([code for code, correct in ndata['train'] if not correct]),
-        len({code for code, correct in ndata['train'] if not correct})))
-
-    #iprograms.sort()
-    #random.Random(0).shuffle(programs)
-    #split = int(len(programs)*0.7)
-    #data = {
-    #    'train': programs[:split],
-    #    'test': programs[split:],
-    #}
-    data = ndata
-    
     # extract attributes from training data
     patterns = collections.Counter()
     for code, correct in data['train']:
@@ -494,18 +214,11 @@ if __name__ == '__main__':
             print('\t'.join(['code', 'correct'] + ['a'+str(i) for i in range(len(attrs))]), file=f)
             print('\t'.join(['d'] * (len(attrs)+2)), file=f)
             print('meta\tclass', file=f)
+
+            # print rows (program, correct, attr1, attr2, …)
             for code, correct in data[t]:
                 record = '{}\t{}'.format(repr(code), 'T' if correct else 'F')
-
-                ## check attributes by using tgrep to find patterns
-                #tree = _tgrep_prepare(prolog_parse(code))
-                #for pat in attrs:
-                #    matches = list(tgrep_positions(pat, [tree]))
-                #    record += '\t{}'.format(bool(matches[0]))
-
-                # check attributes by enumerating patterns in code
                 code_pats = [pat for pat, nodes in get_patterns(code)]
                 for pat in attrs:
                     record += '\t{}'.format('T' if pat in code_pats else 'F')
-
                 print(record, file=f)