From 27d4458613a5b61f16ad9bf59ca1de460fea3b3a Mon Sep 17 00:00:00 2001
From: Timotej Lazar <timotej.lazar@fri.uni-lj.si>
Date: Mon, 9 Jan 2017 18:07:23 +0100
Subject: First commit is the best commit

---
 monkey/__init__.py |   0
 monkey/action.py   | 239 +++++++++++++++++++++++++
 monkey/patterns.py | 511 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 750 insertions(+)
 create mode 100644 monkey/__init__.py
 create mode 100644 monkey/action.py
 create mode 100644 monkey/patterns.py

(limited to 'monkey')

diff --git a/monkey/__init__.py b/monkey/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/monkey/action.py b/monkey/action.py
new file mode 100644
index 0000000..c2f73ad
--- /dev/null
+++ b/monkey/action.py
@@ -0,0 +1,239 @@
+#!/usr/bin/python3
+
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+
+class Action:
+    def __init__(self, abstime, data):
+        self.type = data['typ']
+        self.time = abstime   # time from start
+
+        # generic actions
+        if self.type == 'open':
+            self.timestamp = data['time']
+        elif self.type == 'ins':
+            self.type = 'insert'
+            self.offset = data['off']
+            self.text = data['txt']
+            self.length = len(self.text)
+        elif self.type == 'rm':
+            self.type = 'remove'
+            self.offset = data['off']
+            self.text = data['txt']
+            self.length = len(self.text)
+        elif self.type == 'test':
+            # data['feedback'] is a 'test_results' hint object, or a list
+            # containing such hint
+            hint = None
+            if isinstance(data['feedback'], list):
+                for hint in data['feedback']:
+                    if hint['id'] == 'test_results':
+                        break
+            else:
+                hint = data['feedback']
+            if hint is not None:
+                self.n_correct = hint['args']['passed']
+                self.n_all = hint['args']['total']
+            else:
+                self.n_correct = self.n_all = None
+        elif self.type == 'hint':
+            self.feedback = data['feedback']
+        elif self.type == 'hnt':
+            # obsolete Prolog hint action, with no additional info
+            self.type = 'hint'
+            self.feedback = None
+        elif self.type == 'plan':
+            self.index = data.get('index')
+
+        # Prolog actions
+        elif self.type == 'prolog_solve':
+            self.query = data['query']
+        elif self.type == 'slva':
+            # obsolete Prolog "solve all" action
+            self.type = 'prolog_solve'
+            self.query = data['qry']
+        elif self.type == 'experiment':
+            self.data = data['data']
+
+        # Python actions
+        elif self.type == 'python_input':
+            self.text = data['txt']
+        elif self.type == 'python_run':
+            self.program = data['program']
+
+        # robot actions
+        elif self.type == 'robot_run':
+            self.program = data['program']
+
+    def __str__(self):
+        s = 't = ' + str(self.time/1000.0) + '  ' + self.type
+        if self.type in {'insert', 'remove'}:
+            s += ': "' + self.text.replace('\n', '\\n').replace('\t', '\\t') + '" at ' + str(self.offset)
+        elif self.type == 'test':
+            s += ': {0} / {1}'.format(self.n_correct, self.n_all)
+        elif self.type == 'hint':
+            if self.feedback is not None:
+                s += ': ' + ', '.join(sorted([hint['id'] for hint in self.feedback]))
+            else:
+                s += ': ?'
+        elif self.type == 'plan':
+            if self.index is not None:
+                s += ': ' + str(self.index)
+        elif self.type == 'prolog_solve':
+            s += ': "' + self.query + '"'
+        elif self.type == 'experiment':
+            s += ': ' + self.data;
+        return s
+
+    # apply this action to text
+    def apply(self, text):
+        if self.type == 'insert':
+            return text[:self.offset] + self.text + text[self.offset:]
+        elif self.type == 'remove':
+            return text[:self.offset] + text[self.offset+self.length:]
+        else:
+            return text
+
+    # reverse the application of this action
+    def unapply(self, text):
+        if self.type == 'insert':
+            return text[:self.offset] + text[self.offset+self.length:]
+        elif self.type == 'remove':
+            return text[:self.offset] + self.text + text[self.offset:]
+        else:
+            return text
+
+# parse log from database into a list of actions, cleaning up some fluff.
+# ignore non-text actions (queries and tests)
+def parse(data):
+    if data == None:
+        return [], []
+
+    actions = []
+    incorrect = set()
+
+    time = 0
+    code = ''
+    for packet in data:
+        try:
+            time += packet['dt']
+            action = Action(time, packet)
+        except:
+            # ignore any errors while decoding a packet
+            sys.stderr.write('Error decoding packet: {}\n'.format(packet))
+            continue
+
+        # skip normalization if this is the first action
+        if actions == []:
+            actions.append(action)
+            code = action.apply(code)
+            continue
+
+        # add to list of actions; modify previously added action if necessary
+        prev = actions[-1]
+
+        # remove superfluous REMOVE action when newline is inserted (due to editor auto-indent)
+        if prev.type == 'remove' and action.type == 'insert' and \
+           action.time == prev.time and \
+           action.offset == prev.offset and action.length > prev.length and \
+           action.text[action.length-prev.length:] == prev.text:
+            # discard last REMOVE action
+            code = prev.unapply(code)
+            actions.pop()
+
+            # replace current action with something better
+            length = action.length - prev.length
+            new = Action(prev.time, {'typ': 'ins', 'off': prev.offset, 'txt': action.text[:length]})
+            actions.append(new)
+            code = new.apply(code)
+
+        # remove superfluous INSERT action when newline is removed (due to editor auto-indent)
+        elif prev.type == 'remove' and action.type == 'insert' and \
+             action.time == prev.time and \
+             action.offset == prev.offset and action.length < prev.length and \
+             prev.text[prev.length-action.length:] == action.text:
+            # discard last INSERT action
+            code = prev.unapply(code)
+            actions.pop()
+
+            # replace current action with something better
+            length = prev.length - action.length
+            new = Action(prev.time, {'typ': 'rem', 'off': prev.offset, 'txt': prev.text[:length]})
+            actions.append(new)
+            code = new.apply(code)
+
+        # otherwise, simply append the current action
+        else:
+            actions.append(action)
+            code = action.apply(code)
+
+    return actions
+
+# expand any multi-char actions (does not do anything for the 99%)
+def expand(actions):
+    i = 0
+    while i < len(actions):
+        if actions[i].type == 'insert' and len(actions[i].text) > 1:
+            a = actions.pop(i)
+            for offset in range(len(a.text)):
+                actions.insert(i+offset,
+                        Action(a.time, {'typ': 'ins', 'off': a.offset+offset, 'txt': a.text[offset]}))
+            i += len(a.text)
+        elif actions[i].type == 'remove' and len(actions[i].text) > 1:
+            a = actions.pop(i)
+            for offset in range(len(a.text)):
+                actions.insert(i,
+                        Action(a.time, {'typ': 'rm', 'off': a.offset+offset, 'txt': a.text[offset]}))
+            i += len(a.text)
+        else:
+            i += 1
+
+# some sample code
+if __name__ == '__main__':
+    from db.models import Problem, Solution
+
+    # print all problem ids
+    print('problems:')
+    for problem in Problem.list():
+        print('  {}\t{}'.format(problem.id, problem.identifier))
+    print()
+
+    pid = input('enter problem id: ')
+
+    # print all attempt ids for the selected problem
+    print('users solving problem ' + str(pid) + ':')
+    attempts = Solution.filter(problem_id=pid)
+    print(', '.join([str(attempt.codeq_user_id) for attempt in attempts]))
+    print()
+
+    uid = input('enter user id: ')
+    attempt = Solution.get(problem_id=pid, codeq_user_id=uid)
+
+    try:
+        actions = parse(attempt.trace)
+        print('read ' + str(len(actions)) + ' actions from log')
+
+        print('code versions for this attempt:')
+        code = ''
+        for action in actions:
+            code = action.apply(code)
+            print(action)
+            print(code.strip())
+            print()
+    except Exception as ex:
+        sys.stderr.write('Error parsing action log: ' + str(ex))
diff --git a/monkey/patterns.py b/monkey/patterns.py
new file mode 100644
index 0000000..ae958a0
--- /dev/null
+++ b/monkey/patterns.py
@@ -0,0 +1,511 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import collections
+from itertools import combinations
+import pickle
+import random
+import re
+import sys
+
+from nltk import ParentedTree, Tree
+from nltk.tgrep import tgrep_positions
+
+from prolog.util import Token, parse as prolog_parse
+
+# construct pattern to match the structure of nodes given by [include]
+def pattern(node, include):
+    if isinstance(node, Token):
+        if node.type == 'NAME':
+            return '"{}"'.format(node.val)
+        return None
+
+    if any(n is node for n in include):
+        if node.label() == 'variable':
+            return 'variable <: "{}"'.format(node[0].val)
+        return None
+
+    label = node.label()
+    subpats = [pattern(child, include) for child in node]
+
+    if label == 'functor':
+        return '{} <1 ({})'.format(label, subpats[0])
+
+    pat = None
+    if any(subpats):
+        if label == 'and':
+            if subpats[1]:
+                pat = subpats[1]
+            if subpats[0]:
+                if pat:
+                    pat = subpats[0] + ' .. (' + pat + ')'
+                else:
+                    pat = subpats[0]
+        elif label == 'args':
+            pat = label
+            for i, subpat in enumerate(subpats):
+                if subpat:
+                    pat += ' <{} ({})'.format(i+1, subpat)
+        elif label == 'binop':
+            pat = label
+            if subpats[0]:
+                pat += ' <1 ({})'.format(subpats[0])
+            pat += ' <2 ("{}")'.format(node[1].val)
+            if subpats[2]:
+                pat += ' <3 ({})'.format(subpats[2])
+        elif label == 'clause':
+            pat = label
+            for i, subpat in enumerate(subpats):
+                if subpat:
+                    pat += ' {} ({})'.format('<1' if i == 0 else '<<', subpats[i])
+        elif label == 'compound':
+            if len(subpats) > 1 and subpats[1]:
+                pat = label
+                for i, subpat in enumerate(subpats):
+                    pat += ' <{} ({})'.format(i+1, subpat)
+            else:
+                return None
+        elif label == 'head':
+            pat = label
+            pat += ' <1 ({})'.format(subpats[0])
+        if not pat:
+            for s in subpats:
+                if s:
+                    pat = s
+                    break
+    return pat
+
+# construct a pattern connecting (variable) node [a] to [b]
+def connect(a, b):
+    path_a = []
+    node = a
+    while node.parent():
+        node = node.parent()
+        path_a.insert(0, node)
+
+    path_b = []
+    node = b
+    while node.parent():
+        node = node.parent()
+        path_b.insert(0, node)
+
+    common_ancestor = None
+    for i in range(min(len(path_a), len(path_b))):
+        if path_a[i] is not path_b[i]:
+            break
+        if path_a[i].label() in {'compound', 'head'}:
+            break
+        common_ancestor = path_a[i]
+
+    def node_label(node):
+        if node.label() == 'compound':
+            right = node[1]
+            nargs = 1
+            while right._label == "args" and len(right) == 2:
+                right = right[1]
+                nargs += 1
+            return 'compound <1 (functor <: "{}{}")'.format(node[0][0].val, nargs)
+        if node.label() == 'binop':
+            return 'binop <2 "{}"'.format(node[1].val)
+        return node.label()
+    i = 0
+    while path_a[i].label() != 'clause':
+        i += 1
+
+    # path from top to common ancestor
+    pat = path_a[i].label()
+    i += 1
+    n_top = 0
+    while i < min(len(path_a), len(path_b)) and path_a[i] is path_b[i]:
+        node = path_a[i]
+        i += 1
+        if node.label() == 'and':
+            continue
+        if node.parent().label() == 'and':
+            op = '<+(and)'
+        else:
+            op = '<{}'.format(node.parent_index()+1)
+        pat += ' {} ({}'.format(op, node_label(node))
+        n_top += 1
+
+    path_a = path_a[i:]
+    path_b = path_b[i:]
+
+    # path from common ancestor to [a]
+    n_a = 0
+    for node in path_a:
+        if node.label() == 'and':
+            continue
+        op = '<'
+        if node.parent().label() == 'and':
+            op = '<+(and)'
+        elif node.parent_index() is not None:
+            op = '<{}'.format(node.parent_index()+1)
+        pat += ' {} ({}'.format(op, node_label(node))
+        n_a += 1
+    pat += ' <{} ({} <: "{}")'.format(a.parent_index()+1, a.label(), a[0].val)
+    pat += ')' * n_a
+
+    # path from common ancestor to [b]
+    n_b = 0
+    for node in path_b:
+        if node.label() == 'and':
+            continue
+        op = '<'
+        if node.parent().label() == 'and':
+            op = '<+(and)'
+        elif node.parent_index() is not None:
+            op = '<{}'.format(node.parent_index()+1)
+        pat += ' {} ({}'.format(op, node_label(node))
+        n_b += 1
+    pat += ' <{} ({} <: "{}")'.format(b.parent_index()+1, b.label(), b[0].val)
+
+    pat += ')' * (n_top + n_b)
+    return pat
+
+# replace variable names with patterns and backlinks
+def postprocess(pattern):
+    macros = '@ VAR /[A-Z]/; '
+    #m = re.search(r'"[A-Z][^"]*_[0-9]+[^"]*"', pattern)
+    m = re.search(r'"[A-Z]_[0-9]+"', pattern)
+    nvars = 0
+    while m is not None:
+        orig_name = m.group(0)
+        n = orig_name.strip('"').split("_")[1]
+        pat_name = 'v{}'.format(nvars)
+        nvars += 1
+        pattern = pattern[:m.start()] + '@VAR={}{}'.format(pat_name, n) + pattern[m.end():]
+        for m in re.finditer(orig_name, pattern):
+            pattern = pattern[:m.start()] + '~{}{}'.format(pat_name, n) + pattern[m.end():]
+        m = re.search(r'"([A-Z]*_[0-9]+)"', pattern)
+    return macros + pattern
+
+def postprocess_simple(pattern):
+    pattern = postprocess(pattern)
+    if pattern.startswith("@ VAR /[A-Z]/; clause <2 (or"):
+        return None
+    #relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|\(variable.*?\))', pattern)
+    relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', pattern)
+    
+    # elements
+    elements = []
+    current = ""
+    i = 0
+    par = 0
+    while i < len(pattern):
+        if par > 0 and pattern[i] == ")":
+            par -= 1
+            if par == 0:
+                elements.append(current)
+                current = ""
+        if par > 0 and pattern[i] == "(":
+            par += 1
+        if par == 0 and \
+                (pattern[i:].startswith("(head") or 
+                 pattern[i:].startswith("(compound") or
+                 pattern[i:].startswith("(binop")):
+            par = 1
+        if par > 0:
+            current += pattern[i]
+        i += 1
+    # simplify variable
+    for ei, e in enumerate(elements):
+    #    #elements[ei] = re.sub("\(variable.*?\)", "(variable)", e)
+        elements[ei] = "("+" ".join(re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', e))+")"
+    elements = sorted(elements)
+    #print(pattern)
+    #print(relevant)
+    #return "_".join(relevant)#pattern
+    return " ".join(elements)#pattern
+
+# construct pattern to match the structure of nodes given by [include],
+# supports variables and literals
+def pattern2(node, include):
+    if isinstance(node, Token):
+        return None
+
+    label = node.label()
+    if any(n is node for n in include):
+        if label == 'literal':
+            return '"{}"'.format(node[0].val)
+        if label == 'variable':
+            return '{}'.format(label)
+        return None
+    if label == 'functor':
+        return '({} "{}")'.format(label, node[0].val)
+
+    subpats = [pattern2(child, include) for child in node]
+    pat = None
+    if any(subpats):
+        if label == 'and':
+            if subpats[1]:
+                pat = subpats[1]
+            if subpats[0]:
+                if pat:
+                    pat = subpats[0] + ' ' + pat
+                else:
+                    pat = subpats[0]
+        elif label == 'args':
+            pat = label
+            for i, subpat in enumerate(subpats):
+                if subpat:
+                    pat += ' {}'.format(subpat)
+            pat = '(' + pat + ')'
+        elif label == 'unop':
+            pat = '(' + label + ' ' + node[0].val + ' ' + subpats[1] + ')'
+        elif label == 'binop':
+            pat = label
+            if subpats[0]:
+                pat += ' {}'.format(subpats[0])
+            pat += ' "{}"'.format(node[1].val)
+            if subpats[2]:
+                pat += ' {}'.format(subpats[2])
+            pat = '(' + pat + ')'
+        elif label == 'clause':
+            pat = label
+            for i, subpat in enumerate(subpats):
+                if subpat:
+                    pat += ' {}'.format(subpats[i])
+            return '(' + pat + ')'
+        elif label == 'compound':
+            if len(subpats) > 1 and subpats[1]:
+                pat = label
+                for i, subpat in enumerate(subpats):
+                    pat += ' {}'.format(subpat)
+                pat = '(' + pat + ')'
+            else:
+                return None
+        elif label == 'head':
+            pat = label
+            pat += ' {}'.format(subpats[0])
+            pat = '(' + pat + ')'
+        elif label == 'list':
+            pat = 'list '
+            if subpats[0]:
+                pat += '(h {})'.format(subpats[0])
+            if subpats[0] and subpats[1]:
+                pat += ' '
+            if subpats[1]:
+                pat += '(t {})'.format(subpats[1])
+            pat = '(' + pat + ')'
+        if not pat:
+            for s in subpats:
+                if s:
+                    pat = s
+                    break
+    return pat
+
+def get_patterns(tree):
+    orig = tree
+    if isinstance(tree, str):
+        tree = prolog_parse(tree)
+        if tree is None:
+            return
+    tree = ParentedTree.convert(tree)
+
+    # get patterns separately for each clause
+    for clause in tree:
+        all_variables = []
+        variables = collections.defaultdict(list)
+        def walk(node):
+            if isinstance(node, Tree):
+                if node.label() == 'variable':
+                    name = node[0].val
+                    variables[name].append(node)
+                    all_variables.append(node)
+                else:
+                    for child in node:
+                        walk(child)
+        walk(clause)
+
+        # connecting pairs of nodes with same variable
+        for var, nodes in variables.items():
+            for selected in combinations(nodes, 2):
+                pat = pattern2(clause, selected)
+                if pat:
+                    print(pat)
+                    yield pat, selected
+                #pat = connect(*selected)
+                #if pat:
+                #    pp = postprocess_simple(pat)
+                #    if pp:
+                #        yield pp, selected
+
+        # add singletons
+        for var, nodes in variables.items():
+            if len(nodes) == 1: 
+                pat = pattern2(clause, nodes)
+                if pat:
+                    print(pat)
+                    yield pat, nodes
+                #pat = pattern(tree, var)
+                #if pat:
+                #    pp = postprocess_simple(pat)
+                #    if pp:
+                #        yield pp, nodes
+        
+        # add patterns for literal/variable pairs
+        literals = [node for node in clause.subtrees() if node.label() == 'literal']
+        for literal in literals:
+            top = literal
+            while top != clause and top.parent().label() in {'compound', 'binop', 'unop', 'args', 'list'}:
+                top = top.parent()
+            variables = [node for node in top.subtrees() if node.label() == 'variable']
+            for var in variables:
+                pat = pattern2(clause, [var, literal])
+                if pat:
+                    yield pat, [var, literal]
+
+#        # connecting pairs of nodes with variables
+#        for selected in combinations(all_variables, 2):
+#            pat = connect(selected[0], selected[1])
+#            if pat:
+#                yield postprocess(pat), selected
+
+#        # using pairs of nodes with variables
+#        for selected in combinations(all_variables, 2):
+#            pat = pattern(tree, selected)
+#            if pat:
+#                yield postprocess(pat), selected
+
+#        # using pairs of nodes with same variable
+#        for var, nodes in variables.items():
+#            for selected in combinations(nodes, 2):
+#                pat = pattern(tree, selected)
+#                if pat:
+#                    yield postprocess(pat), selected
+
+#       # using each variable separately
+#       for var, nodes in variables.items():
+#           pat = pattern(tree, nodes)
+#           if pat:
+#               yield postprocess(pat), nodes
+
+#       # using each goal to select variables FIXME
+#       goals = [s for s in tree.subtrees() if s.label() in {'compound', 'binop'}]
+#       for goal in goals:
+#           goal_vars = {n.val for n in goal.leaves() if n.type == 'VARIABLE'}
+#           pat = pattern(tree, goal_vars)
+#           if pat:
+#               yield postprocess(pat)
+
+# nltk.tgrep does not play nice with non-Tree leaves
+def _tgrep_prepare(tree):
+    if not isinstance(tree, ParentedTree):
+        tree = ParentedTree.convert(tree)
+    def prepare(node):
+        if isinstance(node, Token) or isinstance(node, str):
+            return ParentedTree(str(node), [])
+        return ParentedTree(node.label(), [prepare(child) for child in node])
+    return prepare(tree)
+
+def find_motif(tree, motif):
+    tree = _tgrep_prepare(tree)
+    return tgrep_positions(motif, [tree])
+
+# Extract edits and other data from existing traces for each problem.
+# Run with: python3 -m monkey.patterns <problem ID> <solutions.pickle>
+if __name__ == '__main__':
+    # Ignore traces from these users.
+    ignored_users = [
+        1, # admin
+        231, # test
+        360, # test2
+        358, # sasha
+    ]
+
+    pid = int(sys.argv[1])
+    name = sys.argv[2]
+    submissions = pickle.load(open('pickle/programs-{}.pickle'.format(pid), 'rb'))
+
+    print('Analyzing programs for problem {}…'.format(pid))
+    ndata = {
+        'train': [],
+        'test': []
+    }
+    # get all users
+    users = set()
+    for code, data in submissions.items():
+        users |= data['users']
+    users = list(users)
+    users.sort()
+    random.Random(0).shuffle(users)
+    split = int(len(users)*0.7)
+    learn_users = set(users[:split])
+    test_users = set(users[split:])
+    print(test_users)
+
+    for code, data in submissions.items():
+        if len(code) > 1000 or prolog_parse(code) is None:
+            continue
+        if name not in code:
+            continue
+        ndata['train'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & learn_users)
+        ndata['test'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & test_users)
+        #programs += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'])
+
+    print('correct: {} ({} unique)'.format(
+        len([code for code, correct in ndata['train'] if correct]),
+        len({code for code, correct in ndata['train'] if correct})))
+    print('incorrect: {} ({} unique)'.format(
+        len([code for code, correct in ndata['train'] if not correct]),
+        len({code for code, correct in ndata['train'] if not correct})))
+
+    #iprograms.sort()
+    #random.Random(0).shuffle(programs)
+    #split = int(len(programs)*0.7)
+    #data = {
+    #    'train': programs[:split],
+    #    'test': programs[split:],
+    #}
+    data = ndata
+    
+    # extract attributes from training data
+    patterns = collections.Counter()
+    for code, correct in data['train']:
+        for pat, nodes in get_patterns(code):
+            patterns[pat] += 1
+
+    attrs = []
+    with open('data/attributes-{:03}.tab'.format(pid), 'w') as pattern_file:
+        for i, (pat, count) in enumerate(patterns.most_common()):
+            if count < 1:
+                break
+            attrs.append(pat)
+            print('a{}\t{}'.format(i, pat), file=pattern_file)
+
+    # check and write attributes for training/test data
+    for t in ['train', 'test']:
+        with open('data/programs-{:03}-{}.tab'.format(pid, t), 'w') as f:
+            # print header
+            print('\t'.join(['code', 'correct'] + ['a'+str(i) for i in range(len(attrs))]), file=f)
+            print('\t'.join(['d'] * (len(attrs)+2)), file=f)
+            print('meta\tclass', file=f)
+            for code, correct in data[t]:
+                record = '{}\t{}'.format(repr(code), 'T' if correct else 'F')
+
+                ## check attributes by using tgrep to find patterns
+                #tree = _tgrep_prepare(prolog_parse(code))
+                #for pat in attrs:
+                #    matches = list(tgrep_positions(pat, [tree]))
+                #    record += '\t{}'.format(bool(matches[0]))
+
+                # check attributes by enumerating patterns in code
+                code_pats = [pat for pat, nodes in get_patterns(code)]
+                for pat in attrs:
+                    record += '\t{}'.format('T' if pat in code_pats else 'F')
+
+                print(record, file=f)
-- 
cgit v1.2.1