author    Timotej Lazar <timotej.lazar@fri.uni-lj.si>  2017-01-09 18:07:23 +0100
committer Timotej Lazar <timotej.lazar@fri.uni-lj.si>  2017-01-09 18:07:23 +0100
commit    27d4458613a5b61f16ad9bf59ca1de460fea3b3a (patch)
tree      1bc98d9672af405d71c915436b0af8001d276ec5
First commit is the best commit
-rw-r--r--  .gitignore            2
-rw-r--r--  db/__init__.py       82
-rw-r--r--  db/models.py        173
-rw-r--r--  monkey/__init__.py    0
-rw-r--r--  monkey/action.py    239
-rw-r--r--  monkey/patterns.py  511
-rw-r--r--  prolog/__init__.py    0
-rw-r--r--  prolog/lexer.py     130
-rw-r--r--  prolog/parser.py    236
-rw-r--r--  prolog/util.py      238
-rwxr-xr-x  test-rules.py       225
11 files changed, 1836 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ca28708
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+prolog/parsetab.py
diff --git a/db/__init__.py b/db/__init__.py
new file mode 100644
index 0000000..783df6c
--- /dev/null
+++ b/db/__init__.py
@@ -0,0 +1,82 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import threading
+import psycopg2
+
+__all__ = ['get_connection', 'return_connection', 'setup', 'models']
+
+_module_access_lock = threading.Lock()
+
+_host = None # the database hostname/IP
+_port = None # the database port number
+_database = None # the name of the database
+_username = None # the username to access the database
+_password = None # the password to access the database
+
+
+# database parameters setup
+
+def _get_port():
+ try:
+ return int(os.environ.get('CODEQ_DB_PORT'))
+ except (TypeError, ValueError):
+ return 5432
+
+
+def setup(
+ host=os.environ.get('CODEQ_DB_HOST') or 'localhost',
+ port=_get_port(),
+ database=os.environ.get('CODEQ_DB_DATABASE') or 'codeq',
+ username=os.environ.get('CODEQ_DB_USER') or 'codeq',
+ password=os.environ.get('CODEQ_DB_PASS') or 'c0d3q'
+):
+ """Sets the database location and authentication parameters."""
+ global _host, _port, _database, _username, _password
+ _host = host
+ _port = port
+ _database = database
+ _username = username
+ _password = password
+
+# connection pooling
+
+_connection_pool = []
+
+
+def get_connection():
+ """Retrieves a database connection from the connection pool."""
+ with _module_access_lock:
+ if _host is None:
+ setup() # lazy init
+ if len(_connection_pool) > 0:
+ return _connection_pool.pop()
+ return psycopg2.connect(host=_host, port=_port, database=_database, user=_username, password=_password)
+
+
+def return_connection(connection):
+ """Returns the given database connection to the pool."""
+ try:
+ connection.rollback() # sanity check
+ except psycopg2.Error:
+ return
+ with _module_access_lock:
+ _connection_pool.append(connection)
+
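+# Example usage (a minimal sketch):
+#   conn = get_connection()
+#   try:
+#       cur = conn.cursor()
+#       cur.execute('select 1')
+#       print(cur.fetchone())
+#       cur.close()
+#   finally:
+#       return_connection(conn)
+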
+
+if __name__ == '__main__':
+ setup()
diff --git a/db/models.py b/db/models.py
new file mode 100644
index 0000000..9edbec4
--- /dev/null
+++ b/db/models.py
@@ -0,0 +1,173 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import collections
+
+from . import get_connection, return_connection
+
+__all__ = ['CodeqUser', 'Solution']
+
+class CodeqUser(collections.namedtuple('CodeqUser', ['id', 'username', 'password', 'name', 'email', 'is_admin', 'is_active', 'date_joined', 'last_login', 'gui_lang', 'robot_address', 'saml_data', 'gui_layout'])):
+ __sql_prefix = 'select id, username, password, name, email, is_admin, is_active, date_joined, last_login, gui_lang, robot_address, saml_data, gui_layout from codeq_user'
+
+ @staticmethod
+ def get(**kwargs):
+ return _general_get(kwargs, CodeqUser, CodeqUser.__sql_prefix)
+
+ @staticmethod
+ def list():
+ return _general_list(CodeqUser, CodeqUser.__sql_prefix)
+
+ @staticmethod
+ def filter(**kwargs):
+ return _general_filter(kwargs, CodeqUser, CodeqUser.__sql_prefix)
+
+ @staticmethod
+ def solved_problems(user_id, language):
+ return _run_sql('select g.identifier, p.identifier from solution s inner join problem p on p.id = s.problem_id inner join problem_group g on g.id = p.problem_group_id inner join language l on l.id = p.language_id where s.codeq_user_id = %s and l.identifier = %s and s.done = True', (user_id, language), fetch_one=False)
+
+class Problem(collections.namedtuple('Problem', ['id', 'language', 'group', 'identifier'])):
+ __sql_prefix = '''\
+ select p.id, l.identifier, g.identifier, p.identifier
+ from problem p
+ inner join problem_group g on g.id = p.problem_group_id
+ inner join language l on l.id = p.language_id'''
+ __sql_order = 'p.language_id, p.problem_group_id, p.id'
+
+ @staticmethod
+ def get(**kwargs):
+ kwargs = {'p.'+k: v for k, v in kwargs.items()}
+ return _general_get(kwargs, Problem, Problem.__sql_prefix)
+
+ @staticmethod
+ def list():
+ return _general_list(Problem, Problem.__sql_prefix, order=Problem.__sql_order)
+
+ @staticmethod
+ def filter(**kwargs):
+ kwargs = {'p.'+k: v for k, v in kwargs.items()}
+ return _general_filter(kwargs, Problem, Problem.__sql_prefix, order=Problem.__sql_order)
+
+ # get a list of problems with the given language identifier
+ @staticmethod
+ def filter_language(language):
+ kwargs = {'l.identifier': language}
+ return _general_filter(kwargs, Problem, Problem.__sql_prefix, order=Problem.__sql_order)
+
+# known as Attempt in the original code
+class Solution(collections.namedtuple('Solution', ['id', 'done', 'content', 'problem_id', 'codeq_user_id', 'trace'])):
+ __sql_prefix = 'select id, done, content, problem_id, codeq_user_id, trace from solution'
+
+ @staticmethod
+ def get(**kwargs):
+ return _general_get(kwargs, Solution, Solution.__sql_prefix)
+
+ @staticmethod
+ def list():
+ return _general_list(Solution, Solution.__sql_prefix)
+
+ @staticmethod
+ def filter(**kwargs):
+ return _general_filter(kwargs, Solution, Solution.__sql_prefix)
+
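+# Example usage (a sketch; assumes such rows exist in the database):
+#   user = CodeqUser.get(username='alice')
+#   attempts = Solution.filter(problem_id=1, done=True)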
+
+def _no_row_conversion(row):
+ return row
+
+def _general_get(kwargs_dict, clazz, sql_select, row_conversion_fn=_no_row_conversion):
+ conditions = []
+ parameters = []
+ for field_name, field_value in kwargs_dict.items():
+ conditions.append(field_name + ' = %s')
+ parameters.append(field_value)
+ if len(conditions) == 0:
+ return None
+ conn = get_connection()
+ try:
+ cur = conn.cursor('crsr1') # a named cursor: scrolling is done on the server
+ cur.arraysize = 1 # number of rows to fetch at a time
+ try:
+ cur.execute(sql_select + ' where ' + ' and '.join(conditions), parameters)
+ row = cur.fetchone()
+ if row:
+ return clazz(*row_conversion_fn(row))
+ return None
+ finally:
+ cur.close()
+ finally:
+ conn.commit()
+ return_connection(conn)
+
+def _general_filter(kwargs_dict, clazz, sql_select, row_conversion_fn=_no_row_conversion, order='id'):
+ conditions = []
+ parameters = []
+ for field_name, field_value in kwargs_dict.items():
+ conditions.append(field_name + ' = %s')
+ parameters.append(field_value)
+ if len(conditions) == 0:
+ return _general_list(clazz, sql_select)
+ conn = get_connection()
+ try:
+ cur = conn.cursor('crsr2') # a named cursor: scrolling is done on the server
+ cur.arraysize = 10000 # number of rows to fetch at a time
+ try:
+ cur.execute(sql_select + ' where ' + ' and '.join(conditions) + ' order by ' + order, parameters)
+ result = []
+ row = cur.fetchone()
+ while row:
+ result.append(clazz(*row_conversion_fn(row)))
+ row = cur.fetchone()
+ return result
+ finally:
+ cur.close()
+ finally:
+ conn.commit()
+ return_connection(conn)
+
+def _general_list(clazz, sql_select, row_conversion_fn=_no_row_conversion, order='id'):
+ conn = get_connection()
+ try:
+ cur = conn.cursor('crsr3') # a named cursor: scrolling is done on the server
+ cur.arraysize = 10000 # number of rows to fetch at a time
+ try:
+ cur.execute(sql_select + ' order by ' + order)
+ result = []
+ row = cur.fetchone()
+ while row:
+ result.append(clazz(*row_conversion_fn(row)))
+ row = cur.fetchone()
+ return result
+ finally:
+ cur.close()
+ finally:
+ conn.commit()
+ return_connection(conn)
+
+def _run_sql(sql, params, fetch_one=False):
+ conn = get_connection()
+ try:
+ cur = conn.cursor()
+ try:
+ cur.execute(sql, params)
+ if fetch_one:
+ return cur.fetchone()
+ else:
+ return cur.fetchall()
+ finally:
+ cur.close()
+ finally:
+ conn.commit()
+ return_connection(conn)
diff --git a/monkey/__init__.py b/monkey/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/monkey/__init__.py
diff --git a/monkey/action.py b/monkey/action.py
new file mode 100644
index 0000000..c2f73ad
--- /dev/null
+++ b/monkey/action.py
@@ -0,0 +1,239 @@
+#!/usr/bin/python3
+
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+
+class Action:
+ def __init__(self, abstime, data):
+ self.type = data['typ']
+ self.time = abstime # time from start
+
+ # generic actions
+ if self.type == 'open':
+ self.timestamp = data['time']
+ elif self.type == 'ins':
+ self.type = 'insert'
+ self.offset = data['off']
+ self.text = data['txt']
+ self.length = len(self.text)
+ elif self.type == 'rm':
+ self.type = 'remove'
+ self.offset = data['off']
+ self.text = data['txt']
+ self.length = len(self.text)
+ elif self.type == 'test':
+ # data['feedback'] is a 'test_results' hint object, or a list
+ # containing such hint
+ hint = None
+ if isinstance(data['feedback'], list):
+ for hint in data['feedback']:
+ if hint['id'] == 'test_results':
+ break
+ else:
+ hint = data['feedback']
+ if hint is not None:
+ self.n_correct = hint['args']['passed']
+ self.n_all = hint['args']['total']
+ else:
+ self.n_correct = self.n_all = None
+ elif self.type == 'hint':
+ self.feedback = data['feedback']
+ elif self.type == 'hnt':
+ # obsolete Prolog hint action, with no additional info
+ self.type = 'hint'
+ self.feedback = None
+ elif self.type == 'plan':
+ self.index = data.get('index')
+
+ # Prolog actions
+ elif self.type == 'prolog_solve':
+ self.query = data['query']
+ elif self.type == 'slva':
+ # obsolete Prolog "solve all" action
+ self.type = 'prolog_solve'
+ self.query = data['qry']
+ elif self.type == 'experiment':
+ self.data = data['data']
+
+ # Python actions
+ elif self.type == 'python_input':
+ self.text = data['txt']
+ elif self.type == 'python_run':
+ self.program = data['program']
+
+ # robot actions
+ elif self.type == 'robot_run':
+ self.program = data['program']
+
+ def __str__(self):
+ s = 't = ' + str(self.time/1000.0) + ' ' + self.type
+ if self.type in {'insert', 'remove'}:
+ s += ': "' + self.text.replace('\n', '\\n').replace('\t', '\\t') + '" at ' + str(self.offset)
+ elif self.type == 'test':
+ s += ': {0} / {1}'.format(self.n_correct, self.n_all)
+ elif self.type == 'hint':
+ if self.feedback is not None:
+ s += ': ' + ', '.join(sorted([hint['id'] for hint in self.feedback]))
+ else:
+ s += ': ?'
+ elif self.type == 'plan':
+ if self.index is not None:
+ s += ': ' + str(self.index)
+ elif self.type == 'prolog_solve':
+ s += ': "' + self.query + '"'
+ elif self.type == 'experiment':
+ s += ': ' + self.data
+ return s
+
+ # apply this action to text
+ def apply(self, text):
+ if self.type == 'insert':
+ return text[:self.offset] + self.text + text[self.offset:]
+ elif self.type == 'remove':
+ return text[:self.offset] + text[self.offset+self.length:]
+ else:
+ return text
+
+ # reverse the application of this action
+ def unapply(self, text):
+ if self.type == 'insert':
+ return text[:self.offset] + text[self.offset+self.length:]
+ elif self.type == 'remove':
+ return text[:self.offset] + self.text + text[self.offset:]
+ else:
+ return text
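+
+ # Example (sketch): unapply reverses apply.
+ # a = Action(0, {'typ': 'ins', 'off': 3, 'txt': 'X'})
+ # a.apply('abcdef') # -> 'abcXdef'
+ # a.unapply('abcXdef') # -> 'abcdef'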
+
+# Parse a log from the database into a list of actions, merging or dropping
+# the fluff introduced by editor auto-indent.
+def parse(data):
+ if data is None:
+ return []
+
+ actions = []
+
+ time = 0
+ code = ''
+ for packet in data:
+ try:
+ time += packet['dt']
+ action = Action(time, packet)
+ except Exception:
+ # ignore any errors while decoding a packet
+ sys.stderr.write('Error decoding packet: {}\n'.format(packet))
+ continue
+
+ # skip normalization if this is the first action
+ if actions == []:
+ actions.append(action)
+ code = action.apply(code)
+ continue
+
+ # add to list of actions; modify previously added action if necessary
+ prev = actions[-1]
+
+ # remove superfluous REMOVE action when newline is inserted (due to editor auto-indent)
+ if prev.type == 'remove' and action.type == 'insert' and \
+ action.time == prev.time and \
+ action.offset == prev.offset and action.length > prev.length and \
+ action.text[action.length-prev.length:] == prev.text:
+ # discard last REMOVE action
+ code = prev.unapply(code)
+ actions.pop()
+
+ # replace current action with something better
+ length = action.length - prev.length
+ new = Action(prev.time, {'typ': 'ins', 'off': prev.offset, 'txt': action.text[:length]})
+ actions.append(new)
+ code = new.apply(code)
+
+ # remove superfluous INSERT action when newline is removed (due to editor auto-indent)
+ elif prev.type == 'remove' and action.type == 'insert' and \
+ action.time == prev.time and \
+ action.offset == prev.offset and action.length < prev.length and \
+ prev.text[prev.length-action.length:] == action.text:
+ # discard last INSERT action
+ code = prev.unapply(code)
+ actions.pop()
+
+ # replace current action with something better
+ length = prev.length - action.length
+ new = Action(prev.time, {'typ': 'rm', 'off': prev.offset, 'txt': prev.text[:length]})
+ actions.append(new)
+ code = new.apply(code)
+
+ # otherwise, simply append the current action
+ else:
+ actions.append(action)
+ code = action.apply(code)
+
+ return actions
+
+# expand any multi-character actions into single-character ones (a no-op for
+# the vast majority of actions)
+def expand(actions):
+ i = 0
+ while i < len(actions):
+ if actions[i].type == 'insert' and len(actions[i].text) > 1:
+ a = actions.pop(i)
+ for offset in range(len(a.text)):
+ actions.insert(i+offset,
+ Action(a.time, {'typ': 'ins', 'off': a.offset+offset, 'txt': a.text[offset]}))
+ i += len(a.text)
+ elif actions[i].type == 'remove' and len(actions[i].text) > 1:
+ a = actions.pop(i)
+ for offset in range(len(a.text)):
+ actions.insert(i,
+ Action(a.time, {'typ': 'rm', 'off': a.offset+offset, 'txt': a.text[offset]}))
+ i += len(a.text)
+ else:
+ i += 1
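+
+# Example (sketch): a single 2-character insert becomes two 1-character inserts:
+#   acts = [Action(0, {'typ': 'ins', 'off': 0, 'txt': 'ab'})]
+#   expand(acts) # acts now: insert 'a' at offset 0, insert 'b' at offset 1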
+
+# some sample code
+if __name__ == '__main__':
+ from db.models import Problem, Solution
+
+ # print all problem ids
+ print('problems:')
+ for problem in Problem.list():
+ print(' {}\t{}'.format(problem.id, problem.identifier))
+ print()
+
+ pid = int(input('enter problem id: '))
+
+ # print all attempt ids for the selected problem
+ print('users solving problem ' + str(pid) + ':')
+ attempts = Solution.filter(problem_id=pid)
+ print(', '.join([str(attempt.codeq_user_id) for attempt in attempts]))
+ print()
+
+ uid = int(input('enter user id: '))
+ attempt = Solution.get(problem_id=pid, codeq_user_id=uid)
+
+ try:
+ actions = parse(attempt.trace)
+ print('read ' + str(len(actions)) + ' actions from log')
+
+ print('code versions for this attempt:')
+ code = ''
+ for action in actions:
+ code = action.apply(code)
+ print(action)
+ print(code.strip())
+ print()
+ except Exception as ex:
+ sys.stderr.write('Error parsing action log: ' + str(ex))
diff --git a/monkey/patterns.py b/monkey/patterns.py
new file mode 100644
index 0000000..ae958a0
--- /dev/null
+++ b/monkey/patterns.py
@@ -0,0 +1,511 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import collections
+from itertools import combinations
+import pickle
+import random
+import re
+import sys
+
+from nltk import ParentedTree, Tree
+from nltk.tgrep import tgrep_positions
+
+from prolog.util import Token, parse as prolog_parse
+
+# construct pattern to match the structure of nodes given by [include]
+def pattern(node, include):
+ if isinstance(node, Token):
+ if node.type == 'NAME':
+ return '"{}"'.format(node.val)
+ return None
+
+ if any(n is node for n in include):
+ if node.label() == 'variable':
+ return 'variable <: "{}"'.format(node[0].val)
+ return None
+
+ label = node.label()
+ subpats = [pattern(child, include) for child in node]
+
+ if label == 'functor':
+ return '{} <1 ({})'.format(label, subpats[0])
+
+ pat = None
+ if any(subpats):
+ if label == 'and':
+ if subpats[1]:
+ pat = subpats[1]
+ if subpats[0]:
+ if pat:
+ pat = subpats[0] + ' .. (' + pat + ')'
+ else:
+ pat = subpats[0]
+ elif label == 'args':
+ pat = label
+ for i, subpat in enumerate(subpats):
+ if subpat:
+ pat += ' <{} ({})'.format(i+1, subpat)
+ elif label == 'binop':
+ pat = label
+ if subpats[0]:
+ pat += ' <1 ({})'.format(subpats[0])
+ pat += ' <2 ("{}")'.format(node[1].val)
+ if subpats[2]:
+ pat += ' <3 ({})'.format(subpats[2])
+ elif label == 'clause':
+ pat = label
+ for i, subpat in enumerate(subpats):
+ if subpat:
+ pat += ' {} ({})'.format('<1' if i == 0 else '<<', subpat)
+ elif label == 'compound':
+ if len(subpats) > 1 and subpats[1]:
+ pat = label
+ for i, subpat in enumerate(subpats):
+ pat += ' <{} ({})'.format(i+1, subpat)
+ else:
+ return None
+ elif label == 'head':
+ pat = label
+ pat += ' <1 ({})'.format(subpats[0])
+ if not pat:
+ for s in subpats:
+ if s:
+ pat = s
+ break
+ return pat
+
+# construct a pattern connecting (variable) node [a] to [b]
+def connect(a, b):
+ path_a = []
+ node = a
+ while node.parent():
+ node = node.parent()
+ path_a.insert(0, node)
+
+ path_b = []
+ node = b
+ while node.parent():
+ node = node.parent()
+ path_b.insert(0, node)
+
+ common_ancestor = None
+ for i in range(min(len(path_a), len(path_b))):
+ if path_a[i] is not path_b[i]:
+ break
+ if path_a[i].label() in {'compound', 'head'}:
+ break
+ common_ancestor = path_a[i]
+
+ def node_label(node):
+ if node.label() == 'compound':
+ right = node[1]
+ nargs = 1
+ while right._label == "args" and len(right) == 2:
+ right = right[1]
+ nargs += 1
+ return 'compound <1 (functor <: "{}{}")'.format(node[0][0].val, nargs)
+ if node.label() == 'binop':
+ return 'binop <2 "{}"'.format(node[1].val)
+ return node.label()
+ i = 0
+ while path_a[i].label() != 'clause':
+ i += 1
+
+ # path from top to common ancestor
+ pat = path_a[i].label()
+ i += 1
+ n_top = 0
+ while i < min(len(path_a), len(path_b)) and path_a[i] is path_b[i]:
+ node = path_a[i]
+ i += 1
+ if node.label() == 'and':
+ continue
+ if node.parent().label() == 'and':
+ op = '<+(and)'
+ else:
+ op = '<{}'.format(node.parent_index()+1)
+ pat += ' {} ({}'.format(op, node_label(node))
+ n_top += 1
+
+ path_a = path_a[i:]
+ path_b = path_b[i:]
+
+ # path from common ancestor to [a]
+ n_a = 0
+ for node in path_a:
+ if node.label() == 'and':
+ continue
+ op = '<'
+ if node.parent().label() == 'and':
+ op = '<+(and)'
+ elif node.parent_index() is not None:
+ op = '<{}'.format(node.parent_index()+1)
+ pat += ' {} ({}'.format(op, node_label(node))
+ n_a += 1
+ pat += ' <{} ({} <: "{}")'.format(a.parent_index()+1, a.label(), a[0].val)
+ pat += ')' * n_a
+
+ # path from common ancestor to [b]
+ n_b = 0
+ for node in path_b:
+ if node.label() == 'and':
+ continue
+ op = '<'
+ if node.parent().label() == 'and':
+ op = '<+(and)'
+ elif node.parent_index() is not None:
+ op = '<{}'.format(node.parent_index()+1)
+ pat += ' {} ({}'.format(op, node_label(node))
+ n_b += 1
+ pat += ' <{} ({} <: "{}")'.format(b.parent_index()+1, b.label(), b[0].val)
+
+ pat += ')' * (n_top + n_b)
+ return pat
+
+# replace variable names with patterns and backlinks
+def postprocess(pattern):
+ macros = '@ VAR /[A-Z]/; '
+ #m = re.search(r'"[A-Z][^"]*_[0-9]+[^"]*"', pattern)
+ m = re.search(r'"[A-Z]_[0-9]+"', pattern)
+ nvars = 0
+ while m is not None:
+ orig_name = m.group(0)
+ n = orig_name.strip('"').split("_")[1]
+ pat_name = 'v{}'.format(nvars)
+ nvars += 1
+ pattern = pattern[:m.start()] + '@VAR={}{}'.format(pat_name, n) + pattern[m.end():]
+ for m in re.finditer(orig_name, pattern):
+ pattern = pattern[:m.start()] + '~{}{}'.format(pat_name, n) + pattern[m.end():]
+ m = re.search(r'"([A-Z]*_[0-9]+)"', pattern)
+ return macros + pattern
+
+def postprocess_simple(pattern):
+ pattern = postprocess(pattern)
+ if pattern.startswith("@ VAR /[A-Z]/; clause <2 (or"):
+ return None
+ #relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|\(variable.*?\))', pattern)
+ relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', pattern)
+
+ # elements
+ elements = []
+ current = ""
+ i = 0
+ par = 0
+ while i < len(pattern):
+ if par > 0 and pattern[i] == ")":
+ par -= 1
+ if par == 0:
+ elements.append(current)
+ current = ""
+ if par > 0 and pattern[i] == "(":
+ par += 1
+ if par == 0 and \
+ (pattern[i:].startswith("(head") or
+ pattern[i:].startswith("(compound") or
+ pattern[i:].startswith("(binop")):
+ par = 1
+ if par > 0:
+ current += pattern[i]
+ i += 1
+ # simplify variable
+ for ei, e in enumerate(elements):
+ # #elements[ei] = re.sub("\(variable.*?\)", "(variable)", e)
+ elements[ei] = "("+" ".join(re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', e))+")"
+ elements = sorted(elements)
+ #print(pattern)
+ #print(relevant)
+ #return "_".join(relevant)#pattern
+ return " ".join(elements)#pattern
+
+# construct pattern to match the structure of nodes given by [include],
+# supports variables and literals
+def pattern2(node, include):
+ if isinstance(node, Token):
+ return None
+
+ label = node.label()
+ if any(n is node for n in include):
+ if label == 'literal':
+ return '"{}"'.format(node[0].val)
+ if label == 'variable':
+ return '{}'.format(label)
+ return None
+ if label == 'functor':
+ return '({} "{}")'.format(label, node[0].val)
+
+ subpats = [pattern2(child, include) for child in node]
+ pat = None
+ if any(subpats):
+ if label == 'and':
+ if subpats[1]:
+ pat = subpats[1]
+ if subpats[0]:
+ if pat:
+ pat = subpats[0] + ' ' + pat
+ else:
+ pat = subpats[0]
+ elif label == 'args':
+ pat = label
+ for i, subpat in enumerate(subpats):
+ if subpat:
+ pat += ' {}'.format(subpat)
+ pat = '(' + pat + ')'
+ elif label == 'unop':
+ pat = '(' + label + ' ' + node[0].val + ' ' + subpats[1] + ')'
+ elif label == 'binop':
+ pat = label
+ if subpats[0]:
+ pat += ' {}'.format(subpats[0])
+ pat += ' "{}"'.format(node[1].val)
+ if subpats[2]:
+ pat += ' {}'.format(subpats[2])
+ pat = '(' + pat + ')'
+ elif label == 'clause':
+ pat = label
+ for i, subpat in enumerate(subpats):
+ if subpat:
+ pat += ' {}'.format(subpats[i])
+ return '(' + pat + ')'
+ elif label == 'compound':
+ if len(subpats) > 1 and subpats[1]:
+ pat = label
+ for i, subpat in enumerate(subpats):
+ pat += ' {}'.format(subpat)
+ pat = '(' + pat + ')'
+ else:
+ return None
+ elif label == 'head':
+ pat = label
+ pat += ' {}'.format(subpats[0])
+ pat = '(' + pat + ')'
+ elif label == 'list':
+ pat = 'list '
+ if subpats[0]:
+ pat += '(h {})'.format(subpats[0])
+ if subpats[0] and subpats[1]:
+ pat += ' '
+ if subpats[1]:
+ pat += '(t {})'.format(subpats[1])
+ pat = '(' + pat + ')'
+ if not pat:
+ for s in subpats:
+ if s:
+ pat = s
+ break
+ return pat
+
+def get_patterns(tree):
+ if isinstance(tree, str):
+ tree = prolog_parse(tree)
+ if tree is None:
+ return
+ tree = ParentedTree.convert(tree)
+
+ # get patterns separately for each clause
+ for clause in tree:
+ all_variables = []
+ variables = collections.defaultdict(list)
+ def walk(node):
+ if isinstance(node, Tree):
+ if node.label() == 'variable':
+ name = node[0].val
+ variables[name].append(node)
+ all_variables.append(node)
+ else:
+ for child in node:
+ walk(child)
+ walk(clause)
+
+ # connecting pairs of nodes with same variable
+ for var, nodes in variables.items():
+ for selected in combinations(nodes, 2):
+ pat = pattern2(clause, selected)
+ if pat:
+ #print(pat)
+ yield pat, selected
+ #pat = connect(*selected)
+ #if pat:
+ # pp = postprocess_simple(pat)
+ # if pp:
+ # yield pp, selected
+
+ # add singletons
+ for var, nodes in variables.items():
+ if len(nodes) == 1:
+ pat = pattern2(clause, nodes)
+ if pat:
+ #print(pat)
+ yield pat, nodes
+ #pat = pattern(tree, var)
+ #if pat:
+ # pp = postprocess_simple(pat)
+ # if pp:
+ # yield pp, nodes
+
+ # add patterns for literal/variable pairs
+ literals = [node for node in clause.subtrees() if node.label() == 'literal']
+ for literal in literals:
+ top = literal
+ while top != clause and top.parent().label() in {'compound', 'binop', 'unop', 'args', 'list'}:
+ top = top.parent()
+ variables = [node for node in top.subtrees() if node.label() == 'variable']
+ for var in variables:
+ pat = pattern2(clause, [var, literal])
+ if pat:
+ yield pat, [var, literal]
+
+# # connecting pairs of nodes with variables
+# for selected in combinations(all_variables, 2):
+# pat = connect(selected[0], selected[1])
+# if pat:
+# yield postprocess(pat), selected
+
+# # using pairs of nodes with variables
+# for selected in combinations(all_variables, 2):
+# pat = pattern(tree, selected)
+# if pat:
+# yield postprocess(pat), selected
+
+# # using pairs of nodes with same variable
+# for var, nodes in variables.items():
+# for selected in combinations(nodes, 2):
+# pat = pattern(tree, selected)
+# if pat:
+# yield postprocess(pat), selected
+
+# # using each variable separately
+# for var, nodes in variables.items():
+# pat = pattern(tree, nodes)
+# if pat:
+# yield postprocess(pat), nodes
+
+# # using each goal to select variables FIXME
+# goals = [s for s in tree.subtrees() if s.label() in {'compound', 'binop'}]
+# for goal in goals:
+# goal_vars = {n.val for n in goal.leaves() if n.type == 'VARIABLE'}
+# pat = pattern(tree, goal_vars)
+# if pat:
+# yield postprocess(pat)
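+
+# Example usage (a sketch): enumerate variable patterns of a small program:
+#   for pat, nodes in get_patterns('a(X, Y) :- b(X), c(Y).'):
+#       print(pat)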
+
+# nltk.tgrep does not play nice with non-Tree leaves
+def _tgrep_prepare(tree):
+ if not isinstance(tree, ParentedTree):
+ tree = ParentedTree.convert(tree)
+ def prepare(node):
+ if isinstance(node, Token) or isinstance(node, str):
+ return ParentedTree(str(node), [])
+ return ParentedTree(node.label(), [prepare(child) for child in node])
+ return prepare(tree)
+
+def find_motif(tree, motif):
+ tree = _tgrep_prepare(tree)
+ return tgrep_positions(motif, [tree])
+
+# Extract patterns and attributes from existing submissions for a problem.
+# Run with: python3 -m monkey.patterns <problem ID> <predicate name>
+# (expects submissions in pickle/programs-<problem ID>.pickle)
+if __name__ == '__main__':
+ # Ignore traces from these users.
+ ignored_users = [
+ 1, # admin
+ 231, # test
+ 360, # test2
+ 358, # sasha
+ ]
+
+ pid = int(sys.argv[1])
+ name = sys.argv[2]
+ submissions = pickle.load(open('pickle/programs-{}.pickle'.format(pid), 'rb'))
+
+ print('Analyzing programs for problem {}…'.format(pid))
+ ndata = {
+ 'train': [],
+ 'test': []
+ }
+ # get all users
+ users = set()
+ for code, data in submissions.items():
+ users |= data['users']
+ users = list(users)
+ users.sort()
+ random.Random(0).shuffle(users)
+ split = int(len(users)*0.7)
+ learn_users = set(users[:split])
+ test_users = set(users[split:])
+ print(test_users)
+
+ for code, data in submissions.items():
+ if len(code) > 1000 or prolog_parse(code) is None:
+ continue
+ if name not in code:
+ continue
+ ndata['train'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & learn_users)
+ ndata['test'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & test_users)
+ #programs += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'])
+
+ print('correct: {} ({} unique)'.format(
+ len([code for code, correct in ndata['train'] if correct]),
+ len({code for code, correct in ndata['train'] if correct})))
+ print('incorrect: {} ({} unique)'.format(
+ len([code for code, correct in ndata['train'] if not correct]),
+ len({code for code, correct in ndata['train'] if not correct})))
+
+ #iprograms.sort()
+ #random.Random(0).shuffle(programs)
+ #split = int(len(programs)*0.7)
+ #data = {
+ # 'train': programs[:split],
+ # 'test': programs[split:],
+ #}
+ data = ndata
+
+ # extract attributes from training data
+ patterns = collections.Counter()
+ for code, correct in data['train']:
+ for pat, nodes in get_patterns(code):
+ patterns[pat] += 1
+
+ attrs = []
+ with open('data/attributes-{:03}.tab'.format(pid), 'w') as pattern_file:
+ for i, (pat, count) in enumerate(patterns.most_common()):
+ if count < 1:
+ break
+ attrs.append(pat)
+ print('a{}\t{}'.format(i, pat), file=pattern_file)
+
+ # check and write attributes for training/test data
+ for t in ['train', 'test']:
+ with open('data/programs-{:03}-{}.tab'.format(pid, t), 'w') as f:
+ # print header
+ print('\t'.join(['code', 'correct'] + ['a'+str(i) for i in range(len(attrs))]), file=f)
+ print('\t'.join(['d'] * (len(attrs)+2)), file=f)
+ print('meta\tclass', file=f)
+ for code, correct in data[t]:
+ record = '{}\t{}'.format(repr(code), 'T' if correct else 'F')
+
+ ## check attributes by using tgrep to find patterns
+ #tree = _tgrep_prepare(prolog_parse(code))
+ #for pat in attrs:
+ # matches = list(tgrep_positions(pat, [tree]))
+ # record += '\t{}'.format(bool(matches[0]))
+
+ # check attributes by enumerating patterns in code
+ code_pats = [pat for pat, nodes in get_patterns(code)]
+ for pat in attrs:
+ record += '\t{}'.format('T' if pat in code_pats else 'F')
+
+ print(record, file=f)
diff --git a/prolog/__init__.py b/prolog/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/prolog/__init__.py
diff --git a/prolog/lexer.py b/prolog/lexer.py
new file mode 100644
index 0000000..5023e57
--- /dev/null
+++ b/prolog/lexer.py
@@ -0,0 +1,130 @@
+#!/usr/bin/python3
+
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import ply.lex as lex
+
+# LEXER
+
+#states = (
+# ('comment', 'exclusive'),
+#)
+
+# tokens; treat operators as names if followed by (
+operators = {
+ r':-': 'FROM',
+ r'-->': 'FROMDCG',
+ r'->': 'IMPLIES',
+ r'\+': 'NOT',
+ r'=': 'EQU',
+ r'\=': 'NEQU',
+ r'==': 'EQ',
+ r'\==': 'NEQ',
+ r'=..': 'UNIV',
+ r'is': 'IS',
+ r'=:=': 'EQA',
+ r'=\=': 'NEQA',
+ r'<': 'LT',
+ r'=<': 'LE',
+ r'>': 'GT',
+ r'>=': 'GE',
+ r'@<': 'LTL',
+ r'@=<': 'LEL',
+ r'@>': 'GTL',
+ r'@>=': 'GEL',
+ r'#=': 'EQFD',
+ r'#\=': 'NEQFD',
+ r'#<': 'LTFD',
+ r'#=<': 'LEFD',
+ r'#>': 'GTFD',
+ r'#>=': 'GEFD',
+ r'in': 'IN',
+ r'ins': 'INS',
+ r'..': 'THROUGH',
+ r'+': 'PLUS',
+ r'-': 'MINUS',
+ r'*': 'STAR',
+ r'/': 'DIV',
+ r'//': 'IDIV',
+ r'mod': 'MOD',
+ r'**': 'POW',
+ r'^': 'POW',
+ r'.': 'PERIOD',
+ r',': 'COMMA',
+ r';': 'SEMI'
+}
+tokens = sorted(set(operators.values())) + [
+ 'UINTEGER', 'UREAL',
+ 'NAME', 'VARIABLE', 'STRING',
+ 'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'PIPE', 'LBRACE', 'RBRACE',
+ 'INVALID'
+]
+
+# punctuation
+t_LBRACKET = r'\['
+t_RBRACKET = r'\]'
+t_LPAREN = r'\('
+t_RPAREN = r'\)'
+t_PIPE = r'\|'
+t_LBRACE = r'{'
+t_RBRACE = r'}'
+
+t_UINTEGER = r'[0-9]+'
+t_UREAL = r'[0-9]+\.[0-9]+([eE][-+]?[0-9]+)?|inf|nan'
+t_VARIABLE = r'(_|[A-Z])[a-zA-Z0-9_]*'
+t_STRING = r'"(""|\\.|[^\"])*"'
+
+# no support for nested comments yet
+def t_comment(t):
+ r'(/\*(.|\n)*?\*/)|(%.*)'
+ pass
+
+def t_NAME(t):
+ r"'(''|\\.|[^\\'])*'|[a-z][a-zA-Z0-9_]*|[-+*/\\^<>=~:.?@#$&]+|!|;|,"
+ if t.value == ',' or \
+ t.lexer.lexpos >= len(t.lexer.lexdata) or t.lexer.lexdata[t.lexer.lexpos] != '(':
+ t.type = operators.get(t.value, 'NAME')
+ return t
+
+t_ignore = ' \t'
+
+def t_newline(t):
+ r'\n+'
+ t.lexer.lineno += len(t.value)
+
+def t_error(t):
+ # TODO send this to stderr
+ #print("Illegal character '" + t.value[0] + "'")
+ t.type = 'INVALID'
+ t.value = t.value[0]
+ t.lexer.skip(1)
+ return t
+
+lexer = lex.lex(errorlog=lex.NullLogger())
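+
+# Example (sketch):
+#   lexer.input('X = 1.')
+#   [t.type for t in lexer] # -> ['VARIABLE', 'EQU', 'UINTEGER', 'PERIOD']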
+
+if __name__ == '__main__':
+ while True:
+ try:
+ s = input('> ')
+ except EOFError:
+ break
+ if not s:
+ continue
+
+ lexer.input(s)
+ tokens = list(lexer)
+ print(tokens)
diff --git a/prolog/parser.py b/prolog/parser.py
new file mode 100644
index 0000000..4d4f9e9
--- /dev/null
+++ b/prolog/parser.py
@@ -0,0 +1,236 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from nltk import Tree
+import ply.yacc as yacc
+from .lexer import tokens
+from .util import Token
+
+# PARSER
+precedence = (
+ ('nonassoc', 'FROM', 'FROMDCG'),
+ ('right', 'PIPE'),
+ ('right', 'IMPLIES'),
+ ('right', 'NOT'),
+ ('nonassoc', 'EQU', 'NEQU', 'EQ', 'NEQ', 'UNIV', 'IS', 'EQA', 'NEQA', 'LT', 'LE', 'GT', 'GE', 'LTL', 'LEL', 'GTL', 'GEL', 'IN', 'INS', 'THROUGH', 'EQFD', 'NEQFD', 'LTFD', 'LEFD', 'GTFD', 'GEFD'),
+ ('left', 'PLUS', 'MINUS'),
+ ('left', 'STAR', 'DIV', 'IDIV', 'MOD'),
+ ('nonassoc', 'POW'),
+ ('right', 'UMINUS', 'UPLUS'),
+ ('nonassoc', 'UINTEGER', 'UREAL'),
+ ('nonassoc', 'NAME', 'VARIABLE', 'STRING'),
+ ('nonassoc', 'PERIOD'),
+ ('nonassoc', 'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'COMMA', 'SEMI', 'LBRACE', 'RBRACE')
+)
+
+def make_token(p, n):
+ lextoken = p.slice[n]
+ return Token(lextoken.type, lextoken.value, lextoken.lexpos)
+
+def p_text_empty(p):
+ 'text : '
+ p[0] = Tree('text', [])
+def p_text_clause(p):
+ 'text : text clause'
+ p[0] = p[1]
+ p[0].append(p[2])
+
+def p_clause_head(p):
+ 'clause : head PERIOD'
+ p[0] = Tree('clause', [p[1]])
+def p_clause_rule(p):
+ '''clause : head FROM or PERIOD
+ | head FROMDCG or PERIOD'''
+ p[0] = Tree('clause', [p[1], p[3]])
+
+def p_head(p):
+ 'head : term'
+ p[0] = Tree('head', [p[1]])
+
+def p_or_single(p):
+ 'or : if'
+ p[0] = p[1]
+def p_or_if(p):
+ 'or : or SEMI if'
+ p[0] = Tree('or', [p[1], p[3]])
+
+def p_if_single(p):
+ 'if : and'
+ p[0] = p[1]
+def p_if_and(p):
+ 'if : and IMPLIES if'
+ p[0] = Tree('if', [p[1], p[3]])
+
+def p_and_single(p):
+ 'and : term'
+ p[0] = p[1]
+def p_and_term(p):
+ 'and : term COMMA and'
+ p[0] = Tree('and', [p[1], p[3]])
+
+# Special case for zero-arity predicates supported by SWI-Prolog.
+def p_term_functor_zero(p):
+ 'term : functor LPAREN RPAREN'
+ # No whitespace allowed between functor and LPAREN.
+ t2 = make_token(p, 2)
+ if p[1][0].pos + len(p[1][0].val) < t2.pos:
+ raise SyntaxError('whitespace before ' + str(t2))
+ p[0] = Tree('compound', [p[1]])
+def p_term_functor(p):
+ 'term : functor LPAREN args RPAREN'
+ # No whitespace allowed between functor and LPAREN.
+ t2 = make_token(p, 2)
+ if p[1][0].pos + len(p[1][0].val) < t2.pos:
+ raise SyntaxError('whitespace before ' + str(t2))
+ p[0] = Tree('compound', [p[1], p[3]])
+
+def p_term_or(p):
+ 'term : LPAREN or RPAREN'
+ p[0] = p[2]
+def p_term_binary(p):
+ '''term : term PLUS term
+ | term MINUS term
+ | term STAR term
+ | term POW term
+ | term DIV term
+ | term IDIV term
+ | term MOD term
+
+ | term EQU term
+ | term NEQU term
+ | term EQ term
+ | term NEQ term
+ | term UNIV term
+ | term IS term
+
+ | term EQA term
+ | term NEQA term
+ | term LT term
+ | term LE term
+ | term GT term
+ | term GE term
+
+ | term LTL term
+ | term LEL term
+ | term GTL term
+ | term GEL term
+
+ | term PIPE term
+ | term THROUGH term
+ | term IN term
+ | term INS term
+ | term EQFD term
+ | term NEQFD term
+ | term LTFD term
+ | term LEFD term
+ | term GTFD term
+ | term GEFD term'''
+ p[0] = Tree('binop', [p[1], make_token(p, 2), p[3]])
+def p_term_unary(p):
+ '''term : NOT term
+ | MINUS term %prec UMINUS
+ | PLUS term %prec UPLUS'''
+ p[0] = Tree('unop', [make_token(p, 1), p[2]])
+def p_term_list(p):
+ 'term : list'
+ p[0] = p[1]
+
+def p_term_variable(p):
+ 'term : VARIABLE'
+ p[0] = Tree('variable', [make_token(p, 1)])
+def p_term_simple(p):
+ '''term : STRING
+ | UINTEGER
+ | UREAL'''
+ p[0] = Tree('literal', [make_token(p, 1)])
+def p_term_name(p):
+ 'term : NAME'
+ p[0] = make_token(p, 1)
+
+def p_term_clpr(p):
+ 'term : LBRACE clpr RBRACE'
+ p[0] = Tree('term', [make_token(p, 1), p[2], make_token(p, 3)])
+
+# compound term arguments
+def p_args_single(p):
+ 'args : term'
+ p[0] = Tree('args', [p[1]])
+def p_args_multiple(p):
+ 'args : term COMMA args'
+ p[0] = Tree('args', [p[1], p[3]])
+
+# list elements
+def p_elems_single(p):
+ 'elems : term'
+ if isinstance(p[1], Tree) and p[1].label() == 'binop' and p[1][1].type == 'PIPE':
+ p[0] = Tree('list', [p[1][0]])
+ if p[1][2] != Tree('literal', [Token(type='NIL', val='[]')]):
+ p[0].append(p[1][2])
+ else:
+ p[0] = Tree('list', [p[1], Token(type='NIL', val='[]')])
+def p_elems_multiple(p):
+ 'elems : term COMMA elems'
+ p[0] = Tree('list', [p[1], p[3]])
+
+def p_list_empty(p):
+ 'list : LBRACKET RBRACKET'
+ p[0] = Tree('literal', [Token(type='NIL', val='[]')])
+def p_list(p):
+ 'list : LBRACKET elems RBRACKET'
+ p[0] = p[2]
+
+def p_functor(p):
+ 'functor : NAME'
+ p[0] = Tree('functor', [make_token(p, 1)])
+
+# CLP(R) syntax
+def p_clpr_single(p):
+ 'clpr : clpr_constr'
+ p[0] = Tree('clpr', [p[1]])
+def p_clpr_more(p):
+ '''clpr : clpr_constr COMMA clpr
+ | clpr_constr SEMI clpr'''
+ p[0] = Tree('clpr', [p[1], make_token(p, 2), p[3]])
+# XXX temporary until the new parser is in place, this also covers { } notation for DCGs
+def p_clpr_constr(p):
+ 'clpr_constr : term'
+ p[0] = p[1]
+
+def p_error(t):
+ if t is None:
+ raise SyntaxError('unexpected end of file')
+ else:
+ raise SyntaxError('{}: unexpected {}'.format(t.lexpos, t.value))
+
+parser = yacc.yacc(debug=False)
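+
+# Example (sketch): parser.parse('a(X).') produces a tree of the form
+#   (text (clause (head (compound (functor "a") (args (variable "X"))))))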
+
+if __name__ == '__main__':
+ from .util import stringify
+ while True:
+ try:
+ s = input('> ')
+ except EOFError:
+ break
+ if not s:
+ continue
+ ast = parser.parse(s)
+ def pp(node):
+ if isinstance(node, Tree):
+ return '(' + node.label() + ' ' + ' '.join([pp(child) for child in node]) + ')'
+ return '"' + str(node) + '"'
+ print(pp(ast))
+ #ast.pretty_print()
+ #print(stringify(ast))
diff --git a/prolog/util.py b/prolog/util.py
new file mode 100644
index 0000000..402b920
--- /dev/null
+++ b/prolog/util.py
@@ -0,0 +1,238 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from collections import namedtuple
+from collections.abc import Iterable
+import string
+
+from nltk import Tree
+
+# Stores a token's type and value, and optionally the position of the first
+# character in the lexed stream.
+class Token(namedtuple('Token', ['type', 'val', 'pos'])):
+ __slots__ = ()
+
+ # Custom constructor to support default parameters.
+ def __new__(cls, type, val='', pos=None):
+ return super(Token, cls).__new__(cls, type, val, pos)
+
+ def __str__(self):
+ return str(self.val)
+
+ # Only consider type and value when comparing tokens. There is probably a
+ # cleaner way of doing this.
+ __eq__ = lambda x, y: x[0] == y[0] and x[1] == y[1]
+ __ne__ = lambda x, y: x[0] != y[0] or x[1] != y[1]
+ __lt__ = lambda x, y: tuple.__lt__(x[0:2], y[0:2])
+ __le__ = lambda x, y: tuple.__le__(x[0:2], y[0:2])
+ __ge__ = lambda x, y: tuple.__ge__(x[0:2], y[0:2])
+ __gt__ = lambda x, y: tuple.__gt__(x[0:2], y[0:2])
+
+ # Only hash token's value (we don't care about position, and types are
+ # determined by values).
+ def __hash__(self):
+ return hash(self[1])
+
+ # Return a copy of this token, possibly modifying some fields.
+ def clone(self, type=None, val=None, pos=None):
+ return Token(self.type if type is None else type,
+ self.val if val is None else val,
+ self.pos if pos is None else pos)
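+
+# Example (sketch): position is ignored when comparing tokens:
+#   Token('VARIABLE', 'X', 4) == Token('VARIABLE', 'X') # -> True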
+
+from .lexer import lexer, operators
+from .parser import parser
+
+def parse(code):
+ try:
+ return parser.parse(code)
+ except SyntaxError:
+ return None
+
+# Return a list of tokens in [text].
+def tokenize(text):
+ lexer.input(text)
+ return [Token(t.type, t.value, t.lexpos) for t in lexer]
+
+# Return a one-line string representation of [obj] which may be a Tree or a
+# list of tokens.
+def stringify(obj):
+ if isinstance(obj, Token):
+ if obj.type in ('PERIOD', 'COMMA'):
+ return str(obj) + ' '
+ if obj.type in operators.values():
+ return ' ' + str(obj) + ' '
+ return str(obj)
+ if isinstance(obj, Iterable):
+ if isinstance(obj, Tree) and obj.label() == 'clause':
+ return ''.join([stringify(child) for child in obj]) + '\n'
+ return ''.join([stringify(child) for child in obj])
+
+# Return a canonical name for the [n]th variable in scope.
+def canonical_varname(n):
+ names = string.ascii_uppercase
+ if n < len(names):
+ return names[n]
+ return 'X{}'.format(n)
+
+# Rename variables in [tokens] to A0, A1, A2,… in order of appearance.
+def rename_vars_list(tokens, names=None):
+ if names is None:
+ names = {}
+ next_id = len(names)
+
+ # Return a new list.
+ tokens = list(tokens)
+ for i, t in enumerate(tokens):
+ if t.type == 'VARIABLE' and t.val != '_':
+ cur_name = t.val
+ if cur_name not in names:
+ names[cur_name] = canonical_varname(next_id)
+ next_id += 1
+ tokens[i] = t.clone(val=names[cur_name])
+ return tokens
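+
+# Example (sketch):
+#   stringify(rename_vars_list(tokenize('conc([X|T], L, [X|R])')))
+#   # -> 'conc([A|B], C, [A|D])'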
+
+# Rename variables in AST rooted at [root] to A0, A1, A2,… in order of
+# appearance.
+def rename_vars_ast(root, fixed_names=None):
+ if fixed_names is None:
+ fixed_names = {}
+ names = {}
+ next_id = len(fixed_names) + len(names)
+
+ def rename_aux(node):
+ nonlocal fixed_names, names, next_id
+ if isinstance(node, Tree):
+ if node.label() == 'clause':
+ names = {}
+ next_id = len(fixed_names) + len(names)
+ new_children = [rename_aux(child) for child in node]
+ new_node = Tree(node.label(), new_children)
+ elif isinstance(node, Token):
+ if node.type == 'VARIABLE':
+ token = node
+ if token.val.startswith('_'):
+ new_node = token.clone(val=canonical_varname(next_id))
+ next_id += 1
+ else:
+ cur_name = token.val
+ if cur_name in fixed_names:
+ new_name = fixed_names[cur_name]
+ else:
+ if cur_name not in names:
+ names[cur_name] = canonical_varname(next_id)
+ next_id += 1
+ new_name = names[cur_name]
+ new_node = token.clone(val=new_name)
+ else:
+ new_node = node
+ return new_node
+ return rename_aux(root)
+
+# Yield "interesting" parts of a Prolog AST as lists of tokens.
+def interesting_ranges(ast, path=()):
+ if ast.label() in {'clause', 'head', 'or', 'if', 'and'}:
+ if ast.label() == 'clause':
+ # Special case for clause with one goal.
+ if len(ast) == 4 and ast[2].label() == 'term':
+ terminals = ast[2].leaves()
+ yield terminals, path + (ast.label(), 'and')
+
+ if ast.label() == 'and':
+ for i in range(0, len(ast), 2):
+ for j in range(i, len(ast), 2):
+ subs = ast[i:j+1]
+ terminals = []
+ for s in subs:
+ terminals.extend([s] if isinstance(s, Token) else s.leaves())
+ # We want at least some context.
+ if len(terminals) > 1:
+ yield terminals, path + (ast.label(),)
+ else:
+ terminals = ast.leaves()
+ # We want at least some context.
+ if len(terminals) > 1:
+ yield terminals, path + (ast.label(),)
+
+ for subtree in ast:
+ if isinstance(subtree, Tree):
+ yield from interesting_ranges(subtree, path + (ast.label(),))
+
+# Map "formal" variable names in the edit a→b to actual names in code [tokens].
+# The set [variables] contains all variable names in the current scope. These
+# are used in cases such as [A]→[A,B], where the edit introduces new variables.
+# Return a new version of b with actual variable names.
+def map_vars(a, b, tokens, variables):
+ mapping = {}
+ new_index = 0
+ for i in range(len(a)):
+ if tokens[i].type == 'VARIABLE':
+ formal_name = a[i].val
+ if tokens[i].val != '_':
+ actual_name = tokens[i].val
+ else:
+ actual_name = 'New'+str(new_index)
+ new_index += 1
+ mapping[formal_name] = actual_name
+
+ remaining_formal = [t.val for t in b if t.type == 'VARIABLE' and t.val not in mapping.keys()]
+ remaining_actual = [var for var in variables if var not in mapping.values()]
+
+ while len(remaining_actual) < len(remaining_formal):
+ remaining_actual.append('New'+str(new_index))
+ new_index += 1
+
+ for i, formal_name in enumerate(remaining_formal):
+ mapping[formal_name] = remaining_actual[i]
+
+ return [t if t.type != 'VARIABLE' else t.clone(val=mapping[t.val]) for t in b]
+
+# Return a set of predicate names (e.g. conc/3) used in [program].
+def used_predicates(program):
+ predicates = set()
+ def walk(tree, dcg=False):
+ if isinstance(tree, Tree):
+ # DCG predicates can be called without parameters
+ if tree.label() == 'clause' and len(tree) == 4 and \
+ tree[1].type == 'FROMDCG':
+ dcg = True
+ if tree.label() == 'term' and len(tree) >= 3 and \
+ isinstance(tree[0], Tree) and tree[0].label() == 'functor':
+ if len(tree) == 3:
+ predicates.add('{}/0'.format(tree[0][0]))
+ else:
+ predicates.add('{}/{}'.format(tree[0][0], (len(tree[2])+1)//2))
+ for subtree in tree:
+ walk(subtree, dcg)
+ elif isinstance(tree, Token):
+ if dcg and tree.type == 'NAME':
+ predicates.add('{}/{}'.format(tree.val, 2))
+ predicates.add('{}/{}'.format(tree.val, 3))
+ predicates.add('{}/{}'.format(tree.val, 4))
+ tree = parse(program)
+ if tree is not None:
+ walk(tree)
+ return predicates
+
+# Basic sanity check.
+if __name__ == '__main__':
+ var_names = {}
+ before = rename_vars_list(tokenize("dup([A0|A1], [A2|A3])"), var_names)
+ after = rename_vars_list(tokenize("dup([A0|A1], [A5, A4|A3])"), var_names)
+
+ # map the edit onto a (hypothetical) line of student code
+ line = tokenize('dup([H|T], [H2|T2])')
+ variables = [t.val for t in line if t.type == 'VARIABLE']
+ mapped = map_vars(before, after, line, variables)
+ print(stringify(mapped))
diff --git a/test-rules.py b/test-rules.py
new file mode 100755
index 0000000..bb89e7e
--- /dev/null
+++ b/test-rules.py
@@ -0,0 +1,225 @@
+#!/usr/bin/python3
+
+import collections
+import os.path
+import pickle
+import re
+from statistics import mean
+import sys
+
+from termcolor import colored
+
+from monkey.action import parse as parse_trace
+from monkey.patterns import get_patterns
+from prolog.util import parse as prolog_parse, rename_vars_list, stringify, tokenize
+
+# script arguments
+solutions_file = sys.argv[1]
+pid = int(sys.argv[2])
+data_dir = sys.argv[3]
+
+attributes_file = os.path.join(data_dir, 'attributes')
+rules_file = os.path.join(data_dir, 'rules')
+users_file = os.path.join(data_dir, 'users-test')
+programs_file = os.path.join(data_dir, 'programs.pickle')
+
+# read test results for known programs
+test = pickle.load(open(programs_file, 'rb'))
+
+# read traces
+users = [int(line.strip()) for line in open(users_file, 'r').readlines()]
+traces = {}
+for solution in pickle.load(open(solutions_file, 'rb')):
+ if solution.problem_id == pid and solution.codeq_user_id in users:
+ traces[solution.codeq_user_id] = solution.trace
+
+# read attributes
+attributes = dict([line.strip().split('\t') for line in open(attributes_file, 'r').readlines()])
+
+class Rule(collections.namedtuple('Rule', ['klass', 'condition', 'distribution', 'quality'])):
+ def __str__(self):
+ s = 'Rule: class = {}, distribution = {}, quality = {}\n'.format(self.klass, self.distribution, self.quality)
+ s += ''.join([str(pattern) + '\n' for pattern, yes in self.condition])
+ return s
+
+# read rules
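+# An example rule line (assumed format, matching the regex below):
+#   IF a5!=F AND a12!=F THEN correct=F  [ 30  2] 0.918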
+rules = []
+for line in open(rules_file, 'r').readlines():
+ match = re.match(r'IF ((?:a[0-9]*[^ ]*(?: AND )*)*) THEN correct=([TF]) *\[ *([0-9]*) *([0-9]*)\] *([0-9.]*)', line.strip())
+ if match:
+ m = tuple(match.groups())
+ condition = tuple((attributes[field[:-3]], field.endswith('!=F')) for field in m[0].split(' AND '))
+ rules.append(Rule(m[-4], condition, (int(m[-3]), int(m[-2])), float(m[-1])))
+ #print(rules[-1])
+ else:
+ print('Did not understand rule:', line.strip())
+
+def color_print(text, ranges):
+ i = 0
+ for start, length, color in sorted(ranges):
+ # ignore overlapping ranges
+ if start < i:
+ continue
+ print(text[i:start], end='')
+ print(colored(text[start:start+length], color), end='')
+ i = start + length
+ print(text[i:])
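+
+# Example (sketch): print 'abcdef' with 'bc' highlighted in green:
+#   color_print('abcdef', [(1, 2, 'green')])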
+
+# check if given patterns match the rule
+def check_rule(rule, patterns):
+ ret_patterns = []
+ for rule_pattern, yes in rule.condition:
+ if yes:
+ # this pattern must be present
+ for pattern, nodes in patterns:
+ if pattern == rule_pattern:
+ ret_patterns.append((rule_pattern, nodes))
+ else:
+ # this pattern must not be present
+ if rule_pattern in [p[0] for p in patterns]:
+ return []
+ return ret_patterns
+
+# keep track of when each suggestion was applied
+all_suggestions = []
+# programs with no matching rule
+unsuggestable = collections.Counter()
+
+for user, trace in traces.items():
+ # get submissions from trace
+ programs = []
+ code = ''
+ for action in parse_trace(trace):
+ code = action.apply(code)
+ if action.type == 'test':
+ if prolog_parse(code) is None:
+ continue
+ normalized_code = stringify(rename_vars_list(tokenize(code)))
+ if programs and normalized_code == programs[-1][0]:
+ continue
+ correct = test[normalized_code]['n_tests'] == test[normalized_code]['n_passed']
+ programs.append((normalized_code, correct))
+ # ignore actions after first correct submission
+ if correct:
+ break
+
+ # ignore traces with no / only correct submissions
+ if not any(p[1] for p in programs) or all(p[1] for p in programs):
+ continue
+
+ suggested = []
+ for i, (program, correct) in enumerate(programs):
+ program_patterns = list(get_patterns(program))
+ #for p in program_patterns:
+ # print(p[0])
+ #print()
+
+ # check if previously suggested rules match
+ for s in suggested:
+ s['passed'] += 1
+ match = check_rule(s['rule'], program_patterns)
+ if (s['rule'].klass == 'T' and len(match) == len(s['rule'].condition) or
+ s['rule'].klass == 'F' and not match):
+ s['matched'].append(s['passed'])
+
+ # only check programs until first correct submission
+ if correct:
+ print(str(i) + ' PASS\t' + program)
+ print()
+ break
+
+ # check rules in order, buggy rules first
+ found = False
+ for rule in (
+ [r for r in rules if r.klass == 'F'] +
+ [r for r in rules if r.klass == 'T']):
+ match = check_rule(rule, program_patterns)
+ if (rule.klass == 'F' and not match or
+ rule.klass == 'T' and len(match) != len(rule.condition)-1):
+ continue
+ found = True
+
+ # store suggestion to see if it was implemented later
+ if not any(s['program'] == program and s['rule'] == rule for s in suggested):
+ # passed: how many submission before PASS
+ # matched: list of submissions where the suggested rule matched
+ # (the current submission has index 0, the next 1 and so on)
+ suggested.append({'program': program, 'rule': rule, 'found': i, 'passed': 0, 'matched': []})
+
+ # get highlights
+ highlight = set()
+ for m in match:
+ for n in m[1]:
+ highlight.add((n[0].pos, len(n[0].val), ('green' if rule.klass == 'T' else 'red')))
+
+ # print highighted program
+ print(str(i) + ' FAIL', end='\t')
+ color_print(program, list(highlight))
+
+ # print rule
+ for rule_pattern, yes in rule.condition:
+ if rule.klass == 'T':
+ if rule_pattern in [pattern for pattern, nodes in program_patterns]:
+ print('good\t' + str(rule_pattern))
+ else:
+ print('missing\t' + str(rule_pattern))
+ else:
+ if rule_pattern in [pattern for pattern, nodes in program_patterns]:
+ print('buggy\t' + str(rule_pattern))
+ print()
+ break
+
+ if not found:
+ print(str(i) + ' FAIL\t' + str(program))
+ print()
+ unsuggestable[program] += 1
+
+ print('Suggestions and versions in which they were implemented:')
+ for s in suggested:
+ index = len(programs) - (s['passed'] + 1)
+ print(index, [index + m for m in s['matched']])
+ all_suggestions += suggested
+
+ print('-'*30)
+ print()
+
+# report
+not_matched = [s for s in all_suggestions if s['passed'] not in s['matched']]
+matched = [s for s in all_suggestions if s['passed'] in s['matched']]
+
+# rules that did / did not match in the solution
+good = collections.Counter()
+bad = collections.Counter()
+for s in all_suggestions:
+ (good if s in matched else bad)[s['rule']] += 1
+
+print('Statistics')
+print('----------')
+print('# of suggestions that were implemented:', len(matched))
+print('# of suggestions that were not implemented:', len(not_matched))
+print('avg. # of submissions before suggestion was implemented:',
+ sum(s['matched'][0] for s in matched)/len(matched))
+print('avg. # of submissions until PASS after suggestion was implemented:',
+ sum(s['passed'] - s['matched'][0] for s in matched)/len(matched))
+print('avg. # of submissions until PASS if suggestion was not implemented:',
+ sum(s['passed'] for s in not_matched)/len(not_matched))
+#print('avg. % of submissions after suggestion where it was not implemented :', 1-mean(len(s['matched'])/s['passed'] for s in matched))
+print()
+
+print('Unsuggestable programs')
+print('----------------------')
+for p, count in unsuggestable.most_common():
+ print('{}\t{}'.format(count, p))
+print()
+
+print('Good rules')
+print('----------')
+for r, count in good.most_common():
+ print('Suggested for ' + str(count) + ' submissions')
+ print(r)
+
+print('Bad rules')
+print('---------')
+for r, count in bad.most_common():
+ print('Suggested for ' + str(count) + ' submissions')
+ print(r)