-rw-r--r--  .gitignore         |   2
-rw-r--r--  db/__init__.py     |  82
-rw-r--r--  db/models.py       | 173
-rw-r--r--  monkey/__init__.py |   0
-rw-r--r--  monkey/action.py   | 239
-rw-r--r--  monkey/patterns.py | 511
-rw-r--r--  prolog/__init__.py |   0
-rw-r--r--  prolog/lexer.py    | 130
-rw-r--r--  prolog/parser.py   | 236
-rw-r--r--  prolog/util.py     | 238
-rwxr-xr-x  test-rules.py      | 225
11 files changed, 1836 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ca28708
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+prolog/parsetab.py
diff --git a/db/__init__.py b/db/__init__.py
new file mode 100644
index 0000000..783df6c
--- /dev/null
+++ b/db/__init__.py
@@ -0,0 +1,82 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import threading
+import psycopg2
+
+__all__ = ['get_connection', 'return_connection', 'setup', 'models']
+
+_module_access_lock = threading.Lock()
+
+_host = None # the database hostname/IP
+_port = None # the database port number
+_database = None # the name of the database
+_username = None # the username to access the database
+_password = None # the password to access the database
+
+
+# database parameters setup
+
+def _get_port():
+ try:
+ return int(os.environ.get('CODEQ_DB_PORT'))
+ except (TypeError, ValueError): # CODEQ_DB_PORT unset or not an integer
+ return 5432
+
+
+def setup(
+ host=os.environ.get('CODEQ_DB_HOST') or 'localhost',
+ port=_get_port(),
+ database=os.environ.get('CODEQ_DB_DATABASE') or 'codeq',
+ username=os.environ.get('CODEQ_DB_USER') or 'codeq',
+ password=os.environ.get('CODEQ_DB_PASS') or 'c0d3q'
+):
+ """Sets the database location and authentication parameters."""
+ global _host, _port, _database, _username, _password
+ _host = host
+ _port = port
+ _database = database
+ _username = username
+ _password = password
+
+# connection pooling
+
+_connection_pool = []
+
+
+def get_connection():
+ """Retrieves a database connection from the connection pool."""
+ with _module_access_lock:
+ if _host is None:
+ setup() # lazy init
+ if len(_connection_pool) > 0:
+ return _connection_pool.pop()
+ return psycopg2.connect(host=_host, port=_port, database=_database, user=_username, password=_password)
+
+
+def return_connection(connection):
+ """Returns the given database connection to the pool."""
+ try:
+ connection.rollback() # sanity check
+ except:
+ return
+ with _module_access_lock:
+ _connection_pool.append(connection)
+
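+# Example (illustrative): the intended acquire/release pattern. A connection
+# is returned to the pool only if rollback() succeeds, so broken connections
+# are silently dropped rather than reused.
+#
+#     conn = get_connection()
+#     try:
+#         ...  # run queries on conn
+#     finally:
+#         return_connection(conn)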
+
+if __name__ == '__main__':
+ setup()
diff --git a/db/models.py b/db/models.py
new file mode 100644
index 0000000..9edbec4
--- /dev/null
+++ b/db/models.py
@@ -0,0 +1,173 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import collections
+
+from . import get_connection, return_connection
+
+__all__ = ['CodeqUser', 'Solution']
+
+class CodeqUser(collections.namedtuple('CodeqUser', ['id', 'username', 'password', 'name', 'email', 'is_admin', 'is_active', 'date_joined', 'last_login', 'gui_lang', 'robot_address', 'saml_data', 'gui_layout'])):
+ __sql_prefix = 'select id, username, password, name, email, is_admin, is_active, date_joined, last_login, gui_lang, robot_address, saml_data, gui_layout from codeq_user'
+
+ @staticmethod
+ def get(**kwargs):
+ return _general_get(kwargs, CodeqUser, CodeqUser.__sql_prefix)
+
+ @staticmethod
+ def list():
+ return _general_list(CodeqUser, CodeqUser.__sql_prefix)
+
+ @staticmethod
+ def filter(**kwargs):
+ return _general_filter(kwargs, CodeqUser, CodeqUser.__sql_prefix)
+
+ @staticmethod
+ def solved_problems(user_id, language):
+ return _run_sql('select g.identifier, p.identifier from solution s inner join problem p on p.id = s.problem_id inner join problem_group g on g.id = p.problem_group_id inner join language l on l.id = p.language_id where s.codeq_user_id = %s and l.identifier = %s and s.done = True', (user_id, language), fetch_one=False)
+
+class Problem(collections.namedtuple('Problem', ['id', 'language', 'group', 'identifier'])):
+ __sql_prefix = '''\
+ select p.id, l.identifier, g.identifier, p.identifier
+ from problem p
+ inner join problem_group g on g.id = p.problem_group_id
+ inner join language l on l.id = p.language_id'''
+ __sql_order = 'p.language_id, p.problem_group_id, p.id'
+
+ @staticmethod
+ def get(**kwargs):
+ kwargs = {'p.'+k: v for k, v in kwargs.items()}
+ return _general_get(kwargs, Problem, Problem.__sql_prefix)
+
+ @staticmethod
+ def list():
+ return _general_list(Problem, Problem.__sql_prefix, order=Problem.__sql_order)
+
+ @staticmethod
+ def filter(**kwargs):
+ kwargs = {'p.'+k: v for k, v in kwargs.items()}
+ return _general_filter(kwargs, Problem, Problem.__sql_prefix, order=Problem.__sql_order)
+
+ # get a list of problems with the given language identifier
+ @staticmethod
+ def filter_language(language):
+ kwargs = {'l.identifier': language}
+ return _general_filter(kwargs, Problem, Problem.__sql_prefix, order=Problem.__sql_order)
+
+# known as Attempt in the original code
+class Solution(collections.namedtuple('Solution', ['id', 'done', 'content', 'problem_id', 'codeq_user_id', 'trace'])):
+ __sql_prefix = 'select id, done, content, problem_id, codeq_user_id, trace from solution'
+
+ @staticmethod
+ def get(**kwargs):
+ return _general_get(kwargs, Solution, Solution.__sql_prefix)
+
+ @staticmethod
+ def list():
+ return _general_list(Solution, Solution.__sql_prefix)
+
+ @staticmethod
+ def filter(**kwargs):
+ return _general_filter(kwargs, Solution, Solution.__sql_prefix)
+
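+# Example (illustrative): all models expose the same static query interface;
+# e.g. to fetch one (hypothetical) user and their attempts at problem 1:
+#
+#     user = CodeqUser.get(username='alice')
+#     attempts = Solution.filter(problem_id=1, codeq_user_id=user.id)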
+
+def _no_row_conversion(row):
+ return row
+
+def _general_get(kwargs_dict, clazz, sql_select, row_conversion_fn=_no_row_conversion):
+ conditions = []
+ parameters = []
+ for field_name, field_value in kwargs_dict.items():
+ conditions.append(field_name + ' = %s')
+ parameters.append(field_value)
+ if len(conditions) == 0:
+ return None
+ conn = get_connection()
+ try:
+ cur = conn.cursor('crsr1') # a named cursor: scrolling is done on the server
+ cur.arraysize = 1 # scroll unit in the number of rows
+ try:
+ cur.execute(sql_select + ' where ' + ' and '.join(conditions), parameters)
+ row = cur.fetchone()
+ if row:
+ return clazz(*row_conversion_fn(row))
+ return None
+ finally:
+ cur.close()
+ finally:
+ conn.commit()
+ return_connection(conn)
+
+def _general_filter(kwargs_dict, clazz, sql_select, row_conversion_fn=_no_row_conversion, order='id'):
+ conditions = []
+ parameters = []
+ for field_name, field_value in kwargs_dict.items():
+ conditions.append(field_name + ' = %s')
+ parameters.append(field_value)
+ if len(conditions) == 0:
+ return _general_list(clazz, sql_select)
+ conn = get_connection()
+ try:
+ cur = conn.cursor('crsr2') # a named cursor: scrolling is done on the server
+ cur.arraysize = 10000 # scroll unit in the number of rows
+ try:
+ cur.execute(sql_select + ' where ' + ' and '.join(conditions) + ' order by ' + order, parameters)
+ result = []
+ row = cur.fetchone()
+ while row:
+ result.append(clazz(*row_conversion_fn(row)))
+ row = cur.fetchone()
+ return result
+ finally:
+ cur.close()
+ finally:
+ conn.commit()
+ return_connection(conn)
+
+def _general_list(clazz, sql_select, row_conversion_fn=_no_row_conversion, order='id'):
+ conn = get_connection()
+ try:
+ cur = conn.cursor('crsr3') # a named cursor: scrolling is done on the server
+ cur.arraysize = 10000 # scroll unit in the number of rows
+ try:
+ cur.execute(sql_select + ' order by ' + order)
+ result = []
+ row = cur.fetchone()
+ while row:
+ result.append(clazz(*row_conversion_fn(row)))
+ row = cur.fetchone()
+ return result
+ finally:
+ cur.close()
+ finally:
+ conn.commit()
+ return_connection(conn)
+
+def _run_sql(sql, params, fetch_one=False):
+ conn = get_connection()
+ try:
+ cur = conn.cursor()
+ try:
+ cur.execute(sql, params)
+ if fetch_one:
+ return cur.fetchone()
+ else:
+ return cur.fetchall()
+ finally:
+ cur.close()
+ finally:
+ conn.commit()
+ return_connection(conn)
diff --git a/monkey/__init__.py b/monkey/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/monkey/__init__.py
diff --git a/monkey/action.py b/monkey/action.py
new file mode 100644
index 0000000..c2f73ad
--- /dev/null
+++ b/monkey/action.py
@@ -0,0 +1,239 @@
+#!/usr/bin/python3
+
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+
+class Action:
+ def __init__(self, abstime, data):
+ self.type = data['typ']
+ self.time = abstime # time from start
+
+ # generic actions
+ if self.type == 'open':
+ self.timestamp = data['time']
+ elif self.type == 'ins':
+ self.type = 'insert'
+ self.offset = data['off']
+ self.text = data['txt']
+ self.length = len(self.text)
+ elif self.type == 'rm':
+ self.type = 'remove'
+ self.offset = data['off']
+ self.text = data['txt']
+ self.length = len(self.text)
+ elif self.type == 'test':
+ # data['feedback'] is a 'test_results' hint object, or a list
+ # containing such a hint
+ hint = None
+ if isinstance(data['feedback'], list):
+ for hint in data['feedback']:
+ if hint['id'] == 'test_results':
+ break
+ else:
+ hint = data['feedback']
+ if hint is not None:
+ self.n_correct = hint['args']['passed']
+ self.n_all = hint['args']['total']
+ else:
+ self.n_correct = self.n_all = None
+ elif self.type == 'hint':
+ self.feedback = data['feedback']
+ elif self.type == 'hnt':
+ # obsolete Prolog hint action, with no additional info
+ self.type = 'hint'
+ self.feedback = None
+ elif self.type == 'plan':
+ self.index = data.get('index')
+
+ # Prolog actions
+ elif self.type == 'prolog_solve':
+ self.query = data['query']
+ elif self.type == 'slva':
+ # obsolete Prolog "solve all" action
+ self.type = 'prolog_solve'
+ self.query = data['qry']
+ elif self.type == 'experiment':
+ self.data = data['data']
+
+ # Python actions
+ elif self.type == 'python_input':
+ self.text = data['txt']
+ elif self.type == 'python_run':
+ self.program = data['program']
+
+ # robot actions
+ elif self.type == 'robot_run':
+ self.program = data['program']
+
+ def __str__(self):
+ s = 't = ' + str(self.time/1000.0) + ' ' + self.type
+ if self.type in {'insert', 'remove'}:
+ s += ': "' + self.text.replace('\n', '\\n').replace('\t', '\\t') + '" at ' + str(self.offset)
+ elif self.type == 'test':
+ s += ': {0} / {1}'.format(self.n_correct, self.n_all)
+ elif self.type == 'hint':
+ if self.feedback is not None:
+ s += ': ' + ', '.join(sorted([hint['id'] for hint in self.feedback]))
+ else:
+ s += ': ?'
+ elif self.type == 'plan':
+ if self.index is not None:
+ s += ': ' + str(self.index)
+ elif self.type == 'prolog_solve':
+ s += ': "' + self.query + '"'
+ elif self.type == 'experiment':
+ s += ': ' + self.data
+ return s
+
+ # apply this action to text
+ def apply(self, text):
+ if self.type == 'insert':
+ return text[:self.offset] + self.text + text[self.offset:]
+ elif self.type == 'remove':
+ return text[:self.offset] + text[self.offset+self.length:]
+ else:
+ return text
+
+ # reverse the application of this action
+ def unapply(self, text):
+ if self.type == 'insert':
+ return text[:self.offset] + text[self.offset+self.length:]
+ elif self.type == 'remove':
+ return text[:self.offset] + self.text + text[self.offset:]
+ else:
+ return text
+
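+# Example (illustrative): apply() and unapply() are inverses for edit actions;
+# with a = Action(0, {'typ': 'ins', 'off': 3, 'txt': 'X'}):
+#
+#     a.apply('abcdef') == 'abcXdef'
+#     a.unapply('abcXdef') == 'abcdef'
+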
+# parse log from database into a list of actions, cleaning up some fluff
+# (merges the spurious remove/insert pairs generated by editor auto-indent)
+def parse(data):
+ if data is None:
+ return []
+
+ actions = []
+ incorrect = set()
+
+ time = 0
+ code = ''
+ for packet in data:
+ try:
+ time += packet['dt']
+ action = Action(time, packet)
+ except Exception:
+ # ignore any errors while decoding a packet
+ sys.stderr.write('Error decoding packet: {}\n'.format(packet))
+ continue
+
+ # skip normalization if this is the first action
+ if not actions:
+ actions.append(action)
+ code = action.apply(code)
+ continue
+
+ # add to list of actions; modify previously added action if necessary
+ prev = actions[-1]
+
+ # remove superfluous REMOVE action when newline is inserted (due to editor auto-indent)
+ if prev.type == 'remove' and action.type == 'insert' and \
+ action.time == prev.time and \
+ action.offset == prev.offset and action.length > prev.length and \
+ action.text[action.length-prev.length:] == prev.text:
+ # discard last REMOVE action
+ code = prev.unapply(code)
+ actions.pop()
+
+ # replace current action with something better
+ length = action.length - prev.length
+ new = Action(prev.time, {'typ': 'ins', 'off': prev.offset, 'txt': action.text[:length]})
+ actions.append(new)
+ code = new.apply(code)
+
+ # remove superfluous INSERT action when newline is removed (due to editor auto-indent)
+ elif prev.type == 'remove' and action.type == 'insert' and \
+ action.time == prev.time and \
+ action.offset == prev.offset and action.length < prev.length and \
+ prev.text[prev.length-action.length:] == action.text:
+ # discard last INSERT action
+ code = prev.unapply(code)
+ actions.pop()
+
+ # replace current action with something better
+ length = prev.length - action.length
+ new = Action(prev.time, {'typ': 'rm', 'off': prev.offset, 'txt': prev.text[:length]})
+ actions.append(new)
+ code = new.apply(code)
+
+ # otherwise, simply append the current action
+ else:
+ actions.append(action)
+ code = action.apply(code)
+
+ return actions
+
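+# Example (illustrative): a minimal two-packet trace; each packet carries the
+# time delta 'dt' since the previous action:
+#
+#     parse([{'dt': 0, 'typ': 'open', 'time': 1440000000},
+#            {'dt': 1000, 'typ': 'ins', 'off': 0, 'txt': 'a.'}])
+#
+# yields an 'open' and an 'insert' action; applying both in order to the
+# empty string reconstructs the code 'a.'.
+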
+# expand any multi-character insert/remove actions into sequences of
+# single-character actions (a no-op for the vast majority, which are
+# already single-character)
+def expand(actions):
+ i = 0
+ while i < len(actions):
+ if actions[i].type == 'insert' and len(actions[i].text) > 1:
+ a = actions.pop(i)
+ for offset in range(len(a.text)):
+ actions.insert(i+offset,
+ Action(a.time, {'typ': 'ins', 'off': a.offset+offset, 'txt': a.text[offset]}))
+ i += len(a.text)
+ elif actions[i].type == 'remove' and len(actions[i].text) > 1:
+ a = actions.pop(i)
+ for offset in range(len(a.text)):
+ actions.insert(i,
+ Action(a.time, {'typ': 'rm', 'off': a.offset+offset, 'txt': a.text[offset]}))
+ i += len(a.text)
+ else:
+ i += 1
+
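+# Example (illustrative): an insert of "ab" at offset 0 expands into two
+# single-character inserts, "a" at offset 0 followed by "b" at offset 1;
+# multi-character removes expand in reverse order so offsets stay valid.
+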
+# some sample code
+if __name__ == '__main__':
+ from db.models import Problem, Solution
+
+ # print all problem ids
+ print('problems:')
+ for problem in Problem.list():
+ print(' {}\t{}'.format(problem.id, problem.identifier))
+ print()
+
+ pid = int(input('enter problem id: '))
+
+ # print all attempt ids for the selected problem
+ print('users solving problem ' + str(pid) + ':')
+ attempts = Solution.filter(problem_id=pid)
+ print(', '.join([str(attempt.codeq_user_id) for attempt in attempts]))
+ print()
+
+ uid = int(input('enter user id: '))
+ attempt = Solution.get(problem_id=pid, codeq_user_id=uid)
+
+ try:
+ actions = parse(attempt.trace)
+ print('read ' + str(len(actions)) + ' actions from log')
+
+ print('code versions for this attempt:')
+ code = ''
+ for action in actions:
+ code = action.apply(code)
+ print(action)
+ print(code.strip())
+ print()
+ except Exception as ex:
+ sys.stderr.write('Error parsing action log: ' + str(ex) + '\n')
diff --git a/monkey/patterns.py b/monkey/patterns.py
new file mode 100644
index 0000000..ae958a0
--- /dev/null
+++ b/monkey/patterns.py
@@ -0,0 +1,511 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import collections
+from itertools import combinations
+import pickle
+import random
+import re
+import sys
+
+from nltk import ParentedTree, Tree
+from nltk.tgrep import tgrep_positions
+
+from prolog.util import Token, parse as prolog_parse
+
+# construct pattern to match the structure of nodes given by [include]
+def pattern(node, include):
+ if isinstance(node, Token):
+ if node.type == 'NAME':
+ return '"{}"'.format(node.val)
+ return None
+
+ if any(n is node for n in include):
+ if node.label() == 'variable':
+ return 'variable <: "{}"'.format(node[0].val)
+ return None
+
+ label = node.label()
+ subpats = [pattern(child, include) for child in node]
+
+ if label == 'functor':
+ return '{} <1 ({})'.format(label, subpats[0])
+
+ pat = None
+ if any(subpats):
+ if label == 'and':
+ if subpats[1]:
+ pat = subpats[1]
+ if subpats[0]:
+ if pat:
+ pat = subpats[0] + ' .. (' + pat + ')'
+ else:
+ pat = subpats[0]
+ elif label == 'args':
+ pat = label
+ for i, subpat in enumerate(subpats):
+ if subpat:
+ pat += ' <{} ({})'.format(i+1, subpat)
+ elif label == 'binop':
+ pat = label
+ if subpats[0]:
+ pat += ' <1 ({})'.format(subpats[0])
+ pat += ' <2 ("{}")'.format(node[1].val)
+ if subpats[2]:
+ pat += ' <3 ({})'.format(subpats[2])
+ elif label == 'clause':
+ pat = label
+ for i, subpat in enumerate(subpats):
+ if subpat:
+ pat += ' {} ({})'.format('<1' if i == 0 else '<<', subpats[i])
+ elif label == 'compound':
+ if len(subpats) > 1 and subpats[1]:
+ pat = label
+ for i, subpat in enumerate(subpats):
+ pat += ' <{} ({})'.format(i+1, subpat)
+ else:
+ return None
+ elif label == 'head':
+ pat = label
+ pat += ' <1 ({})'.format(subpats[0])
+ if not pat:
+ for s in subpats:
+ if s:
+ pat = s
+ break
+ return pat
+
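+# Example (illustrative): for the clause "a(X) :- b(X)." with only the head
+# occurrence of X selected, pattern() yields the tgrep expression
+#   clause <1 (head <1 (compound <1 (functor <1 ("a")) <2 (args <1 (variable <: "X"))))
+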
+# construct a pattern connecting (variable) node [a] to [b]
+def connect(a, b):
+ path_a = []
+ node = a
+ while node.parent():
+ node = node.parent()
+ path_a.insert(0, node)
+
+ path_b = []
+ node = b
+ while node.parent():
+ node = node.parent()
+ path_b.insert(0, node)
+
+ common_ancestor = None
+ for i in range(min(len(path_a), len(path_b))):
+ if path_a[i] is not path_b[i]:
+ break
+ if path_a[i].label() in {'compound', 'head'}:
+ break
+ common_ancestor = path_a[i]
+
+ def node_label(node):
+ if node.label() == 'compound':
+ right = node[1]
+ nargs = 1
+ while right.label() == "args" and len(right) == 2:
+ right = right[1]
+ nargs += 1
+ return 'compound <1 (functor <: "{}{}")'.format(node[0][0].val, nargs)
+ if node.label() == 'binop':
+ return 'binop <2 "{}"'.format(node[1].val)
+ return node.label()
+ i = 0
+ while path_a[i].label() != 'clause':
+ i += 1
+
+ # path from top to common ancestor
+ pat = path_a[i].label()
+ i += 1
+ n_top = 0
+ while i < min(len(path_a), len(path_b)) and path_a[i] is path_b[i]:
+ node = path_a[i]
+ i += 1
+ if node.label() == 'and':
+ continue
+ if node.parent().label() == 'and':
+ op = '<+(and)'
+ else:
+ op = '<{}'.format(node.parent_index()+1)
+ pat += ' {} ({}'.format(op, node_label(node))
+ n_top += 1
+
+ path_a = path_a[i:]
+ path_b = path_b[i:]
+
+ # path from common ancestor to [a]
+ n_a = 0
+ for node in path_a:
+ if node.label() == 'and':
+ continue
+ op = '<'
+ if node.parent().label() == 'and':
+ op = '<+(and)'
+ elif node.parent_index() is not None:
+ op = '<{}'.format(node.parent_index()+1)
+ pat += ' {} ({}'.format(op, node_label(node))
+ n_a += 1
+ pat += ' <{} ({} <: "{}")'.format(a.parent_index()+1, a.label(), a[0].val)
+ pat += ')' * n_a
+
+ # path from common ancestor to [b]
+ n_b = 0
+ for node in path_b:
+ if node.label() == 'and':
+ continue
+ op = '<'
+ if node.parent().label() == 'and':
+ op = '<+(and)'
+ elif node.parent_index() is not None:
+ op = '<{}'.format(node.parent_index()+1)
+ pat += ' {} ({}'.format(op, node_label(node))
+ n_b += 1
+ pat += ' <{} ({} <: "{}")'.format(b.parent_index()+1, b.label(), b[0].val)
+
+ pat += ')' * (n_top + n_b)
+ return pat
+
+# replace variable names with patterns and backlinks
+def postprocess(pattern):
+ macros = '@ VAR /[A-Z]/; '
+ #m = re.search(r'"[A-Z][^"]*_[0-9]+[^"]*"', pattern)
+ m = re.search(r'"[A-Z]_[0-9]+"', pattern)
+ nvars = 0
+ while m is not None:
+ orig_name = m.group(0)
+ n = orig_name.strip('"').split("_")[1]
+ pat_name = 'v{}'.format(nvars)
+ nvars += 1
+ pattern = pattern[:m.start()] + '@VAR={}{}'.format(pat_name, n) + pattern[m.end():]
+ # replace the remaining occurrences in one pass (the string changes
+ # length, so iterating over stale match positions would corrupt it)
+ pattern = pattern.replace(orig_name, '~{}{}'.format(pat_name, n))
+ m = re.search(r'"([A-Z]*_[0-9]+)"', pattern)
+ return macros + pattern
+
+def postprocess_simple(pattern):
+ pattern = postprocess(pattern)
+ if pattern.startswith("@ VAR /[A-Z]/; clause <2 (or"):
+ return None
+ #relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|\(variable.*?\))', pattern)
+ relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', pattern)
+
+ # elements
+ elements = []
+ current = ""
+ i = 0
+ par = 0
+ while i < len(pattern):
+ if par > 0 and pattern[i] == ")":
+ par -= 1
+ if par == 0:
+ elements.append(current)
+ current = ""
+ if par > 0 and pattern[i] == "(":
+ par += 1
+ if par == 0 and \
+ (pattern[i:].startswith("(head") or
+ pattern[i:].startswith("(compound") or
+ pattern[i:].startswith("(binop")):
+ par = 1
+ if par > 0:
+ current += pattern[i]
+ i += 1
+ # simplify variable
+ for ei, e in enumerate(elements):
+ # #elements[ei] = re.sub("\(variable.*?\)", "(variable)", e)
+ elements[ei] = "("+" ".join(re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', e))+")"
+ elements = sorted(elements)
+ #print(pattern)
+ #print(relevant)
+ #return "_".join(relevant)#pattern
+ return " ".join(elements)#pattern
+
+# construct pattern to match the structure of nodes given by [include],
+# supports variables and literals
+def pattern2(node, include):
+ if isinstance(node, Token):
+ return None
+
+ label = node.label()
+ if any(n is node for n in include):
+ if label == 'literal':
+ return '"{}"'.format(node[0].val)
+ if label == 'variable':
+ return '{}'.format(label)
+ return None
+ if label == 'functor':
+ return '({} "{}")'.format(label, node[0].val)
+
+ subpats = [pattern2(child, include) for child in node]
+ pat = None
+ if any(subpats):
+ if label == 'and':
+ if subpats[1]:
+ pat = subpats[1]
+ if subpats[0]:
+ if pat:
+ pat = subpats[0] + ' ' + pat
+ else:
+ pat = subpats[0]
+ elif label == 'args':
+ pat = label
+ for i, subpat in enumerate(subpats):
+ if subpat:
+ pat += ' {}'.format(subpat)
+ pat = '(' + pat + ')'
+ elif label == 'unop':
+ pat = '(' + label + ' ' + node[0].val + ' ' + subpats[1] + ')'
+ elif label == 'binop':
+ pat = label
+ if subpats[0]:
+ pat += ' {}'.format(subpats[0])
+ pat += ' "{}"'.format(node[1].val)
+ if subpats[2]:
+ pat += ' {}'.format(subpats[2])
+ pat = '(' + pat + ')'
+ elif label == 'clause':
+ pat = label
+ for i, subpat in enumerate(subpats):
+ if subpat:
+ pat += ' {}'.format(subpats[i])
+ return '(' + pat + ')'
+ elif label == 'compound':
+ if len(subpats) > 1 and subpats[1]:
+ pat = label
+ for i, subpat in enumerate(subpats):
+ pat += ' {}'.format(subpat)
+ pat = '(' + pat + ')'
+ else:
+ return None
+ elif label == 'head':
+ pat = label
+ pat += ' {}'.format(subpats[0])
+ pat = '(' + pat + ')'
+ elif label == 'list':
+ pat = 'list '
+ if subpats[0]:
+ pat += '(h {})'.format(subpats[0])
+ if subpats[0] and subpats[1]:
+ pat += ' '
+ if subpats[1]:
+ pat += '(t {})'.format(subpats[1])
+ pat = '(' + pat + ')'
+ if not pat:
+ for s in subpats:
+ if s:
+ pat = s
+ break
+ return pat
+
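+# Example (illustrative): for the clause "a(X) :- b(X)." with both occurrences
+# of X selected, pattern2() yields the s-expression
+#   (clause (head (compound (functor "a") (args variable)))
+#           (compound (functor "b") (args variable)))
+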
+def get_patterns(tree):
+ orig = tree
+ if isinstance(tree, str):
+ tree = prolog_parse(tree)
+ if tree is None:
+ return
+ tree = ParentedTree.convert(tree)
+
+ # get patterns separately for each clause
+ for clause in tree:
+ all_variables = []
+ variables = collections.defaultdict(list)
+ def walk(node):
+ if isinstance(node, Tree):
+ if node.label() == 'variable':
+ name = node[0].val
+ variables[name].append(node)
+ all_variables.append(node)
+ else:
+ for child in node:
+ walk(child)
+ walk(clause)
+
+ # connecting pairs of nodes with same variable
+ for var, nodes in variables.items():
+ for selected in combinations(nodes, 2):
+ pat = pattern2(clause, selected)
+ if pat:
+ print(pat)
+ yield pat, selected
+ #pat = connect(*selected)
+ #if pat:
+ # pp = postprocess_simple(pat)
+ # if pp:
+ # yield pp, selected
+
+ # add singletons
+ for var, nodes in variables.items():
+ if len(nodes) == 1:
+ pat = pattern2(clause, nodes)
+ if pat:
+ print(pat)
+ yield pat, nodes
+ #pat = pattern(tree, var)
+ #if pat:
+ # pp = postprocess_simple(pat)
+ # if pp:
+ # yield pp, nodes
+
+ # add patterns for literal/variable pairs
+ literals = [node for node in clause.subtrees() if node.label() == 'literal']
+ for literal in literals:
+ top = literal
+ while top != clause and top.parent().label() in {'compound', 'binop', 'unop', 'args', 'list'}:
+ top = top.parent()
+ variables = [node for node in top.subtrees() if node.label() == 'variable']
+ for var in variables:
+ pat = pattern2(clause, [var, literal])
+ if pat:
+ yield pat, [var, literal]
+
+# # connecting pairs of nodes with variables
+# for selected in combinations(all_variables, 2):
+# pat = connect(selected[0], selected[1])
+# if pat:
+# yield postprocess(pat), selected
+
+# # using pairs of nodes with variables
+# for selected in combinations(all_variables, 2):
+# pat = pattern(tree, selected)
+# if pat:
+# yield postprocess(pat), selected
+
+# # using pairs of nodes with same variable
+# for var, nodes in variables.items():
+# for selected in combinations(nodes, 2):
+# pat = pattern(tree, selected)
+# if pat:
+# yield postprocess(pat), selected
+
+# # using each variable separately
+# for var, nodes in variables.items():
+# pat = pattern(tree, nodes)
+# if pat:
+# yield postprocess(pat), nodes
+
+# # using each goal to select variables FIXME
+# goals = [s for s in tree.subtrees() if s.label() in {'compound', 'binop'}]
+# for goal in goals:
+# goal_vars = {n.val for n in goal.leaves() if n.type == 'VARIABLE'}
+# pat = pattern(tree, goal_vars)
+# if pat:
+# yield postprocess(pat)
+
+# nltk.tgrep does not play nice with non-Tree leaves
+def _tgrep_prepare(tree):
+ if not isinstance(tree, ParentedTree):
+ tree = ParentedTree.convert(tree)
+ def prepare(node):
+ if isinstance(node, Token) or isinstance(node, str):
+ return ParentedTree(str(node), [])
+ return ParentedTree(node.label(), [prepare(child) for child in node])
+ return prepare(tree)
+
+def find_motif(tree, motif):
+ tree = _tgrep_prepare(tree)
+ return tgrep_positions(motif, [tree])
+
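+# Example (illustrative):
+#
+#     positions = list(find_motif(prolog_parse('a(X) :- b(X).'), 'variable'))
+#
+# yields one list of matching tree positions per input tree.
+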
+# Extract patterns from collected programs for a given problem and write
+# attribute / example tables for rule learning.
+# Run with: python3 -m monkey.patterns <problem ID> <predicate name>
+if __name__ == '__main__':
+ # Ignore traces from these users.
+ ignored_users = [
+ 1, # admin
+ 231, # test
+ 360, # test2
+ 358, # sasha
+ ]
+
+ pid = int(sys.argv[1])
+ name = sys.argv[2]
+ submissions = pickle.load(open('pickle/programs-{}.pickle'.format(pid), 'rb'))
+
+ print('Analyzing programs for problem {}…'.format(pid))
+ ndata = {
+ 'train': [],
+ 'test': []
+ }
+ # get all users
+ users = set()
+ for code, data in submissions.items():
+ users |= data['users']
+ users = list(users)
+ users.sort()
+ random.Random(0).shuffle(users)
+ split = int(len(users)*0.7)
+ learn_users = set(users[:split])
+ test_users = set(users[split:])
+ print(test_users)
+
+ for code, data in submissions.items():
+ if len(code) > 1000 or prolog_parse(code) is None:
+ continue
+ if name not in code:
+ continue
+ ndata['train'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & learn_users)
+ ndata['test'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & test_users)
+ #programs += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'])
+
+ print('correct: {} ({} unique)'.format(
+ len([code for code, correct in ndata['train'] if correct]),
+ len({code for code, correct in ndata['train'] if correct})))
+ print('incorrect: {} ({} unique)'.format(
+ len([code for code, correct in ndata['train'] if not correct]),
+ len({code for code, correct in ndata['train'] if not correct})))
+
+ #iprograms.sort()
+ #random.Random(0).shuffle(programs)
+ #split = int(len(programs)*0.7)
+ #data = {
+ # 'train': programs[:split],
+ # 'test': programs[split:],
+ #}
+ data = ndata
+
+ # extract attributes from training data
+ patterns = collections.Counter()
+ for code, correct in data['train']:
+ for pat, nodes in get_patterns(code):
+ patterns[pat] += 1
+
+ attrs = []
+ with open('data/attributes-{:03}.tab'.format(pid), 'w') as pattern_file:
+ for i, (pat, count) in enumerate(patterns.most_common()):
+ if count < 1:
+ break
+ attrs.append(pat)
+ print('a{}\t{}'.format(i, pat), file=pattern_file)
+
+ # check and write attributes for training/test data
+ for t in ['train', 'test']:
+ with open('data/programs-{:03}-{}.tab'.format(pid, t), 'w') as f:
+ # print header
+ print('\t'.join(['code', 'correct'] + ['a'+str(i) for i in range(len(attrs))]), file=f)
+ print('\t'.join(['d'] * (len(attrs)+2)), file=f)
+ print('meta\tclass', file=f)
+ for code, correct in data[t]:
+ record = '{}\t{}'.format(repr(code), 'T' if correct else 'F')
+
+ ## check attributes by using tgrep to find patterns
+ #tree = _tgrep_prepare(prolog_parse(code))
+ #for pat in attrs:
+ # matches = list(tgrep_positions(pat, [tree]))
+ # record += '\t{}'.format(bool(matches[0]))
+
+ # check attributes by enumerating patterns in code
+ code_pats = [pat for pat, nodes in get_patterns(code)]
+ for pat in attrs:
+ record += '\t{}'.format('T' if pat in code_pats else 'F')
+
+ print(record, file=f)
diff --git a/prolog/__init__.py b/prolog/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/prolog/__init__.py
diff --git a/prolog/lexer.py b/prolog/lexer.py
new file mode 100644
index 0000000..5023e57
--- /dev/null
+++ b/prolog/lexer.py
@@ -0,0 +1,130 @@
+#!/usr/bin/python3
+
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import ply.lex as lex
+
+# LEXER
+
+#states = (
+# ('comment', 'exclusive'),
+#)
+
+# tokens; treat operators as names if followed by (
+operators = {
+ r':-': 'FROM',
+ r'-->': 'FROMDCG',
+ r'->': 'IMPLIES',
+ r'\+': 'NOT',
+ r'=': 'EQU',
+ r'\=': 'NEQU',
+ r'==': 'EQ',
+ r'\==': 'NEQ',
+ r'=..': 'UNIV',
+ r'is': 'IS',
+ r'=:=': 'EQA',
+ r'=\=': 'NEQA',
+ r'<': 'LT',
+ r'=<': 'LE',
+ r'>': 'GT',
+ r'>=': 'GE',
+ r'@<': 'LTL',
+ r'@=<': 'LEL',
+ r'@>': 'GTL',
+ r'@>=': 'GEL',
+ r'#=': 'EQFD',
+ r'#\=': 'NEQFD',
+ r'#<': 'LTFD',
+ r'#=<': 'LEFD',
+ r'#>': 'GTFD',
+ r'#>=': 'GEFD',
+ r'in': 'IN',
+ r'ins': 'INS',
+ r'..': 'THROUGH',
+ r'+': 'PLUS',
+ r'-': 'MINUS',
+ r'*': 'STAR',
+ r'/': 'DIV',
+ r'//': 'IDIV',
+ r'mod': 'MOD',
+ r'**': 'POW',
+ r'^': 'POW',
+ r'.': 'PERIOD',
+ r',': 'COMMA',
+ r';': 'SEMI'
+}
+tokens = sorted(set(operators.values())) + [  # set(): '**' and '^' both map to POW
+ 'UINTEGER', 'UREAL',
+ 'NAME', 'VARIABLE', 'STRING',
+ 'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'PIPE', 'LBRACE', 'RBRACE',
+ 'INVALID'
+]
+
+# punctuation
+t_LBRACKET = r'\['
+t_RBRACKET = r'\]'
+t_LPAREN = r'\('
+t_RPAREN = r'\)'
+t_PIPE = r'\|'
+t_LBRACE = r'{'
+t_RBRACE = r'}'
+
+t_UINTEGER = r'[0-9]+'
+t_UREAL = r'[0-9]+\.[0-9]+([eE][-+]?[0-9]+)?|inf|nan'
+t_VARIABLE = r'(_|[A-Z])[a-zA-Z0-9_]*'
+t_STRING = r'"(""|\\.|[^\"])*"'
+
+# no support for nested comments yet
+def t_comment(t):
+ r'(/\*(.|\n)*?\*/)|(%.*)'
+ pass
+
+def t_NAME(t):
+ r"'(''|\\.|[^\\'])*'|[a-z][a-zA-Z0-9_]*|[-+*/\\^<>=~:.?@#$&]+|!|;|,"
+ if t.value == ',' or \
+ t.lexer.lexpos >= len(t.lexer.lexdata) or t.lexer.lexdata[t.lexer.lexpos] != '(':
+ t.type = operators.get(t.value, 'NAME')
+ return t
+
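+# Example (illustrative): in "X = 1" the '=' is lexed as the operator EQU,
+# while in "=(X, 1)" it is immediately followed by '(' and lexed as a NAME.
+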
+t_ignore = ' \t'
+
+def t_newline(t):
+ r'\n+'
+ t.lexer.lineno += len(t.value)
+
+def t_error(t):
+ # TODO send this to stderr
+ #print("Illegal character '" + t.value[0] + "'")
+ t.type = 'INVALID'
+ t.value = t.value[0]
+ t.lexer.skip(1)
+ return t
+
+lexer = lex.lex(errorlog=lex.NullLogger())
+
+if __name__ == '__main__':
+ while True:
+ try:
+ s = input('> ')
+ except EOFError:
+ break
+ if not s:
+ continue
+
+ lexer.input(s)
+ tokens = list(lexer)
+ print(tokens)
diff --git a/prolog/parser.py b/prolog/parser.py
new file mode 100644
index 0000000..4d4f9e9
--- /dev/null
+++ b/prolog/parser.py
@@ -0,0 +1,236 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from nltk import Tree
+import ply.yacc as yacc
+from .lexer import tokens
+from .util import Token
+
+# PARSER
+precedence = (
+ ('nonassoc', 'FROM', 'FROMDCG'),
+ ('right', 'PIPE'),
+ ('right', 'IMPLIES'),
+ ('right', 'NOT'),
+ ('nonassoc', 'EQU', 'NEQU', 'EQ', 'NEQ', 'UNIV', 'IS', 'EQA', 'NEQA', 'LT', 'LE', 'GT', 'GE', 'LTL', 'LEL', 'GTL', 'GEL', 'IN', 'INS', 'THROUGH', 'EQFD', 'NEQFD', 'LTFD', 'LEFD', 'GTFD', 'GEFD'),
+ ('left', 'PLUS', 'MINUS'),
+ ('left', 'STAR', 'DIV', 'IDIV', 'MOD'),
+ ('nonassoc', 'POW'),
+ ('right', 'UMINUS', 'UPLUS'),
+ ('nonassoc', 'UINTEGER', 'UREAL'),
+ ('nonassoc', 'NAME', 'VARIABLE', 'STRING'),
+ ('nonassoc', 'PERIOD'),
+ ('nonassoc', 'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'COMMA', 'SEMI', 'LBRACE', 'RBRACE')
+)
+
+def make_token(p, n):
+ lextoken = p.slice[n]
+ return Token(lextoken.type, lextoken.value, lextoken.lexpos)
+
+def p_text_empty(p):
+ 'text : '
+ p[0] = Tree('text', [])
+def p_text_clause(p):
+ 'text : text clause'
+ p[0] = p[1]
+ p[0].append(p[2])
+
+def p_clause_head(p):
+ 'clause : head PERIOD'
+ p[0] = Tree('clause', [p[1]])
+def p_clause_rule(p):
+ '''clause : head FROM or PERIOD
+ | head FROMDCG or PERIOD'''
+ p[0] = Tree('clause', [p[1], p[3]])
+
+def p_head(p):
+ 'head : term'
+ p[0] = Tree('head', [p[1]])
+
+def p_or_single(p):
+ 'or : if'
+ p[0] = p[1]
+def p_or_if(p):
+ 'or : or SEMI if'
+ p[0] = Tree('or', [p[1], p[3]])
+
+def p_if_single(p):
+ 'if : and'
+ p[0] = p[1]
+def p_if_and(p):
+ 'if : and IMPLIES if'
+ p[0] = Tree('if', [p[1], p[3]])
+
+def p_and_single(p):
+ 'and : term'
+ p[0] = p[1]
+def p_and_term(p):
+ 'and : term COMMA and'
+ p[0] = Tree('and', [p[1], p[3]])
+
+# Special case for zero-arity predicates supported by SWI-Prolog.
+def p_term_functor_zero(p):
+ 'term : functor LPAREN RPAREN'
+ # No whitespace allowed between functor and LPAREN.
+ t2 = make_token(p, 2)
+ if p[1][0].pos + len(p[1][0].val) < t2.pos:
+ raise SyntaxError('whitespace before ' + str(t2))
+ p[0] = Tree('compound', [p[1]])
+def p_term_functor(p):
+ 'term : functor LPAREN args RPAREN'
+ # No whitespace allowed between functor and LPAREN.
+ t2 = make_token(p, 2)
+ if p[1][0].pos + len(p[1][0].val) < t2.pos:
+ raise SyntaxError('whitespace before ' + str(t2))
+ p[0] = Tree('compound', [p[1], p[3]])
+
+def p_term_or(p):
+ 'term : LPAREN or RPAREN'
+ p[0] = p[2]
+def p_term_binary(p):
+ '''term : term PLUS term
+ | term MINUS term
+ | term STAR term
+ | term POW term
+ | term DIV term
+ | term IDIV term
+ | term MOD term
+
+ | term EQU term
+ | term NEQU term
+ | term EQ term
+ | term NEQ term
+ | term UNIV term
+ | term IS term
+
+ | term EQA term
+ | term NEQA term
+ | term LT term
+ | term LE term
+ | term GT term
+ | term GE term
+
+ | term LTL term
+ | term LEL term
+ | term GTL term
+ | term GEL term
+
+ | term PIPE term
+ | term THROUGH term
+ | term IN term
+ | term INS term
+ | term EQFD term
+ | term NEQFD term
+ | term LTFD term
+ | term LEFD term
+ | term GTFD term
+ | term GEFD term'''
+ p[0] = Tree('binop', [p[1], make_token(p, 2), p[3]])
+def p_term_unary(p):
+ '''term : NOT term
+ | MINUS term %prec UMINUS
+ | PLUS term %prec UPLUS'''
+ p[0] = Tree('unop', [make_token(p, 1), p[2]])
+def p_term_list(p):
+ 'term : list'
+ p[0] = p[1]
+
+def p_term_variable(p):
+ 'term : VARIABLE'
+ p[0] = Tree('variable', [make_token(p, 1)])
+def p_term_simple(p):
+ '''term : STRING
+ | UINTEGER
+ | UREAL'''
+ p[0] = Tree('literal', [make_token(p, 1)])
+def p_term_name(p):
+ 'term : NAME'
+ p[0] = make_token(p, 1)
+
+def p_term_clpr(p):
+ 'term : LBRACE clpr RBRACE'
+ p[0] = Tree('term', [make_token(p, 1), p[2], make_token(p, 3)])
+
+# compound term arguments
+def p_args_single(p):
+ 'args : term'
+ p[0] = Tree('args', [p[1]])
+def p_args_multiple(p):
+ 'args : term COMMA args'
+ p[0] = Tree('args', [p[1], p[3]])
+
+# list elements
+def p_elems_single(p):
+ 'elems : term'
+ if isinstance(p[1], Tree) and p[1].label() == 'binop' and p[1][1].type == 'PIPE':
+ p[0] = Tree('list', [p[1][0]])
+ if p[1][2] != Tree('literal', [Token(type='NIL', val='[]')]): # the empty list, as built by p_list_empty
+ p[0].append(p[1][2])
+ else:
+ p[0] = Tree('list', [p[1], Token(type='NIL', val='[]')])
+def p_elems_multiple(p):
+ 'elems : term COMMA elems'
+ p[0] = Tree('list', [p[1], p[3]])
+
+def p_list_empty(p):
+ 'list : LBRACKET RBRACKET'
+ p[0] = Tree('literal', [Token(type='NIL', val='[]')])
+def p_list(p):
+ 'list : LBRACKET elems RBRACKET'
+ p[0] = p[2]
+
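+# Example (illustrative): "[1,2]" parses to nested list nodes closed by a NIL
+# token: (list (literal "1") (list (literal "2") "[]")).
+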
+def p_functor(p):
+ 'functor : NAME'
+ p[0] = Tree('functor', [make_token(p, 1)])
+
+# CLP(R) syntax
+def p_clpr_single(p):
+ 'clpr : clpr_constr'
+ p[0] = Tree('clpr', [p[1]])
+def p_clpr_more(p):
+ '''clpr : clpr_constr COMMA clpr
+ | clpr_constr SEMI clpr'''
+ p[0] = Tree('clpr', [p[1], make_token(p, 2), p[3]])
+# XXX temporary until the new parser is in place, this also covers { } notation for DCGs
+def p_clpr_constr(p):
+ 'clpr_constr : term'
+ p[0] = p[1]
+
+def p_error(t):
+ if t is None:
+ raise SyntaxError('unexpected end of file')
+ else:
+ raise SyntaxError('{}: unexpected {}'.format(t.lexpos, t.value))
+
+parser = yacc.yacc(debug=False)
+
+if __name__ == '__main__':
+ from .util import stringify
+ while True:
+ try:
+ s = input('> ')
+ except EOFError:
+ break
+ if not s:
+ continue
+ ast = parser.parse(s)
+ def pp(node):
+ if isinstance(node, Tree):
+ return '(' + node.label() + ' ' + ' '.join([pp(child) for child in node]) + ')'
+ return '"' + str(node) + '"'
+ print(pp(ast))
+ #ast.pretty_print()
+ #print(stringify(ast))
diff --git a/prolog/util.py b/prolog/util.py
new file mode 100644
index 0000000..402b920
--- /dev/null
+++ b/prolog/util.py
@@ -0,0 +1,238 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from collections import namedtuple
+from collections.abc import Iterable
+import string
+
+from nltk import Tree
+
+# Stores a token's type and value, and optionally the position of the first
+# character in the lexed stream.
+class Token(namedtuple('Token', ['type', 'val', 'pos'])):
+ __slots__ = ()
+
+ # Custom constructor to support default parameters.
+ def __new__(cls, type, val='', pos=None):
+ return super(Token, cls).__new__(cls, type, val, pos)
+
+ def __str__(self):
+ return str(self.val)
+
+ # Only consider type and value when comparing tokens. There is probably a
+ # cleaner way of doing this.
+ __eq__ = lambda x, y: x[0] == y[0] and x[1] == y[1]
+ __ne__ = lambda x, y: x[0] != y[0] or x[1] != y[1]
+ __lt__ = lambda x, y: tuple.__lt__(x[0:2], y[0:2])
+ __le__ = lambda x, y: tuple.__le__(x[0:2], y[0:2])
+ __ge__ = lambda x, y: tuple.__ge__(x[0:2], y[0:2])
+ __gt__ = lambda x, y: tuple.__gt__(x[0:2], y[0:2])
+
+ # Only hash token's value (we don't care about position, and types are
+ # determined by values).
+ def __hash__(self):
+ return hash(self[1])
+
+ # Return a copy of this token, possibly modifying some fields.
+ def clone(self, type=None, val=None, pos=None):
+ return Token(self.type if type is None else type,
+ self.val if val is None else val,
+ self.pos if pos is None else pos)
+
+from .lexer import lexer, operators
+from .parser import parser
+
+def parse(code):
+ try:
+ return parser.parse(code)
+ except SyntaxError:
+ return None
+
+# Return a list of tokens in [text].
+def tokenize(text):
+ lexer.input(text)
+ return [Token(t.type, t.value, t.lexpos) for t in lexer]
+
+# Return a one-line string representation of [obj] which may be a Tree or a
+# list of tokens.
+def stringify(obj):
+ if isinstance(obj, Token):
+ if obj.type in ('PERIOD', 'COMMA'):
+ return str(obj) + ' '
+ if obj.type in operators.values():
+ return ' ' + str(obj) + ' '
+ return str(obj)
+ if isinstance(obj, Iterable):
+ if isinstance(obj, Tree) and obj.label() == 'clause':
+ return ''.join([stringify(child) for child in obj]) + '\n'
+ return ''.join([stringify(child) for child in obj])
+
+# Return a canonical name for the [n]th variable in scope.
+def canonical_varname(n):
+ names = string.ascii_uppercase
+ if n < len(names):
+ return names[n]
+ return 'X{}'.format(n)
+
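+# Example (illustrative): canonical_varname(0) == 'A',
+# canonical_varname(25) == 'Z', canonical_varname(26) == 'X26'.
+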
+# Rename variables in [tokens] to A, B, C, … in order of appearance.
+def rename_vars_list(tokens, names=None):
+ if names is None:
+ names = {}
+ next_id = len(names)
+
+ # Return a new list.
+ tokens = list(tokens)
+ for i, t in enumerate(tokens):
+ if t.type == 'VARIABLE' and t.val != '_':
+ cur_name = t.val
+ if cur_name not in names:
+ names[cur_name] = canonical_varname(next_id)
+ next_id += 1
+ tokens[i] = t.clone(val=names[cur_name])
+ return tokens
+
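+# Example (illustrative):
+#
+#     stringify(rename_vars_list(tokenize('f(X, Y, X)'))) == 'f(A, B, A)'
+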
+# Rename variables in the AST rooted at [root] to A, B, C, … in order of
+# appearance; the naming scope is reset at each clause.
+def rename_vars_ast(root, fixed_names=None):
+ if fixed_names is None:
+ fixed_names = {}
+ names = {}
+ next_id = len(fixed_names) + len(names)
+
+ def rename_aux(node):
+ nonlocal fixed_names, names, next_id
+ if isinstance(node, Tree):
+ if node.label() == 'clause':
+ names = {}
+ next_id = len(fixed_names) + len(names)
+ new_children = [rename_aux(child) for child in node]
+ new_node = Tree(node.label(), new_children)
+ elif isinstance(node, Token):
+ if node.type == 'VARIABLE':
+ token = node
+ if token.val.startswith('_'):
+ new_node = token.clone(val=canonical_varname(next_id))
+ next_id += 1
+ else:
+ cur_name = token.val
+ if cur_name in fixed_names:
+ new_name = fixed_names[cur_name]
+ else:
+ if cur_name not in names:
+ names[cur_name] = canonical_varname(next_id)
+ next_id += 1
+ new_name = names[cur_name]
+ new_node = token.clone(val=new_name)
+ else:
+ new_node = node
+ return new_node
+ return rename_aux(root)
+
+# Yield "interesting" parts of a Prolog AST as lists of tokens.
+def interesting_ranges(ast, path=()):
+ if ast.label() in {'clause', 'head', 'or', 'if', 'and'}:
+ if ast.label() == 'clause':
+ # Special case for clause with one goal.
+ if len(ast) == 4 and ast[2].label() == 'term':
+ terminals = ast[2].leaves()
+ yield terminals, path + (ast.label(), 'and')
+
+ if ast.label() == 'and':
+ for i in range(0, len(ast), 2):
+ for j in range(i, len(ast), 2):
+ subs = ast[i:j+1]
+ terminals = []
+ for s in subs:
+ terminals.extend([s] if isinstance(s, Token) else s.leaves())
+ # We want at least some context.
+ if len(terminals) > 1:
+ yield terminals, path + (ast.label(),)
+ else:
+ terminals = ast.leaves()
+ # We want at least some context.
+ if len(terminals) > 1:
+ yield terminals, path + (ast.label(),)
+
+ for subtree in ast:
+ if isinstance(subtree, Tree):
+ yield from interesting_ranges(subtree, path + (ast.label(),))
+
+# Map "formal" variable names in the edit a→b to actual names in code [tokens].
+# The set [variables] contains all variable names in the current scope. These
+# are used in cases such as [A]→[A,B], where the edit introduces new variables.
+# Return a new version of b with actual variable names.
+def map_vars(a, b, tokens, variables):
+ mapping = {}
+ new_index = 0
+ for i in range(len(a)):
+ if tokens[i].type == 'VARIABLE':
+ formal_name = a[i].val
+ if tokens[i].val != '_':
+ actual_name = tokens[i].val
+ else:
+ actual_name = 'New'+str(new_index)
+ new_index += 1
+ mapping[formal_name] = actual_name
+
+ remaining_formal = [t.val for t in b if t.type == 'VARIABLE' and t.val not in mapping.keys()]
+ remaining_actual = [var for var in variables if var not in mapping.values()]
+
+ while len(remaining_actual) < len(remaining_formal):
+ remaining_actual.append('New'+str(new_index))
+ new_index += 1
+
+ for i, formal_name in enumerate(remaining_formal):
+ mapping[formal_name] = remaining_actual[i]
+
+ return [t if t.type != 'VARIABLE' else t.clone(val=mapping[t.val]) for t in b]
+
+# Return a set of predicate names (e.g. conc/3) used in [program].
+def used_predicates(program):
+ predicates = set()
+ def walk(tree, dcg=False):
+ if isinstance(tree, Tree):
+ # DCG predicates can be called without parameters
+ if tree.label() == 'clause' and len(tree) == 4 and \
+ tree[1].type == 'FROMDCG':
+ dcg = True
+ # compound terms are parsed as Tree('compound', [functor, args])
+ if tree.label() == 'compound' and \
+ isinstance(tree[0], Tree) and tree[0].label() == 'functor':
+ if len(tree) == 1:
+ predicates.add('{}/0'.format(tree[0][0]))
+ else:
+ # arity = depth of the nested [args] chain
+ nargs, args = 1, tree[1]
+ while len(args) == 2:
+ args = args[1]
+ nargs += 1
+ predicates.add('{}/{}'.format(tree[0][0], nargs))
+ for subtree in tree:
+ walk(subtree, dcg)
+ elif isinstance(tree, Token):
+ if dcg and tree.type == 'NAME':
+ predicates.add('{}/{}'.format(tree.val, 2))
+ predicates.add('{}/{}'.format(tree.val, 3))
+ predicates.add('{}/{}'.format(tree.val, 4))
+ tree = parse(program)
+ if tree is not None:
+ walk(tree)
+ return predicates
+
+# Basic sanity check.
+if __name__ == '__main__':
+ var_names = {}
+ before = rename_vars(tokenize("dup([A0|A1], [A2|A3])"), var_names)
+ after = rename_vars(tokenize("dup([A0|A1], [A5, A4|A3])"), var_names)
+
+ line = lines[0]
+ variables = [t.val for t in tokenize(code) if t.type == 'VARIABLE']
+ mapped = map_vars(before, after, line, variables)
+ print(mapped)
diff --git a/test-rules.py b/test-rules.py
new file mode 100755
index 0000000..bb89e7e
--- /dev/null
+++ b/test-rules.py
@@ -0,0 +1,225 @@
+#!/usr/bin/python3
+
+import collections
+import os.path
+import pickle
+import re
+from statistics import mean
+import sys
+
+from termcolor import colored
+
+from monkey.action import parse as parse_trace
+from monkey.patterns import get_patterns
+from prolog.util import parse as prolog_parse, rename_vars_list, stringify, tokenize
+
+# script arguments
+solutions_file = sys.argv[1]
+pid = int(sys.argv[2])
+data_dir = sys.argv[3]
+
+attributes_file = os.path.join(data_dir, 'attributes')
+rules_file = os.path.join(data_dir, 'rules')
+users_file = os.path.join(data_dir, 'users-test')
+programs_file = os.path.join(data_dir, 'programs.pickle')
+
+# read test results for known programs
+test = pickle.load(open(programs_file, 'rb'))
+
+# read traces
+users = [int(line.strip()) for line in open(users_file, 'r').readlines()]
+traces = {}
+for solution in pickle.load(open(solutions_file, 'rb')):
+ if solution.problem_id == pid and solution.codeq_user_id in users:
+ traces[solution.codeq_user_id] = solution.trace
+
+# read attributes
+attributes = dict([line.strip().split('\t') for line in open(attributes_file, 'r').readlines()])
+
+class Rule(collections.namedtuple('Rule', ['klass', 'condition', 'distribution', 'quality'])):
+ def __str__(self):
+ s = 'Rule: class = {}, distribution = {}, quality = {}\n'.format(self.klass, self.distribution, self.quality)
+ s += ''.join([str(pattern) + '\n' for pattern, yes in self.condition])
+ return s
+
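+# Rules are expected one per line in the textual form produced by the rule
+# learner, e.g. (illustrative):
+#
+#   IF a3!=F AND a7!=F THEN correct=T  [ 12  0] 0.95
+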
+# read rules
+rules = []
+for line in open(rules_file, 'r').readlines():
+ match = re.match(r'IF ((?:a[0-9]*[^ ]*(?: AND )*)*) THEN correct=([TF]) *\[ *([0-9]*) *([0-9]*)\] *([0-9.]*)', line.strip())
+ if match:
+ m = tuple(match.groups())
+ condition = tuple((attributes[field[:-3]], field.endswith('!=F')) for field in m[0].split(' AND '))
+ rules.append(Rule(m[-4], condition, (int(m[-3]), int(m[-2])), float(m[-1])))
+ #print(rules[-1])
+ else:
+ print('Did not understand rule:', line.strip())
+
+def color_print(text, ranges):
+ i = 0
+ for start, length, color in sorted(ranges):
+ # ignore overlapping ranges
+ if start < i:
+ continue
+ print(text[i:start], end='')
+ print(colored(text[start:start+length], color), end='')
+ i = start + length
+ print(text[i:])
+
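+# Example (illustrative): color_print('hello', [(0, 2, 'red')]) prints "he"
+# in red followed by "llo" in the default color.
+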
+# check if given patterns match the rule
+def check_rule(rule, patterns):
+ ret_patterns = []
+ for rule_pattern, yes in rule.condition:
+ if yes:
+ # this pattern must be present
+ for pattern, nodes in patterns:
+ if pattern == rule_pattern:
+ ret_patterns.append((rule_pattern, nodes))
+ else:
+ # this pattern must not be present
+ if rule_pattern in [p[0] for p in patterns]:
+ return []
+ return ret_patterns
+
+# keep track of when each suggestion was applied
+all_suggestions = []
+# programs with no matching rule
+unsuggestable = collections.Counter()
+
+for user, trace in traces.items():
+ # get submissions from trace
+ programs = []
+ code = ''
+ for action in parse_trace(trace):
+ code = action.apply(code)
+ if action.type == 'test':
+ if prolog_parse(code) is None:
+ continue
+ normalized_code = stringify(rename_vars_list(tokenize(code)))
+ if programs and normalized_code == programs[-1][0]:
+ continue
+ correct = test[normalized_code]['n_tests'] == test[normalized_code]['n_passed']
+ programs.append((normalized_code, correct))
+ # ignore actions after first correct submission
+ if correct:
+ break
+
+ # ignore traces with no / only correct submissions
+ if not any(p[1] for p in programs) or all(p[1] for p in programs):
+ continue
+
+ suggested = []
+ for i, (program, correct) in enumerate(programs):
+ program_patterns = list(get_patterns(program))
+ #for p in program_patterns:
+ # print(p[0])
+ #print()
+
+ # check if previously suggested rules match
+ for s in suggested:
+ s['passed'] += 1
+ match = check_rule(s['rule'], program_patterns)
+ if (s['rule'].klass == 'T' and len(match) == len(s['rule'].condition) or
+ s['rule'].klass == 'F' and not match):
+ s['matched'].append(s['passed'])
+
+ # only check programs until first correct submission
+ if correct:
+ print(str(i) + ' PASS\t' + program)
+ print()
+ break
+
+ # check rules in order, buggy rules first
+ found = False
+ for rule in (
+ [r for r in rules if r.klass == 'F'] +
+ [r for r in rules if r.klass == 'T']):
+ match = check_rule(rule, program_patterns)
+ if (rule.klass == 'F' and not match or
+ rule.klass == 'T' and len(match) != len(rule.condition)-1):
+ continue
+ found = True
+
+ # store suggestion to see if it was implemented later
+ if not any(s['program'] == program and s['rule'] == rule for s in suggested):
+ # passed: how many submission before PASS
+ # matched: list of submissions where the suggested rule matched
+ # (the current submission has index 0, the next 1 and so on)
+ suggested.append({'program': program, 'rule': rule, 'found': i, 'passed': 0, 'matched': []})
+
+ # get highlights
+ highlight = set()
+ for m in match:
+ for n in m[1]:
+ highlight.add((n[0].pos, len(n[0].val), ('green' if rule.klass == 'T' else 'red')))
+
+ # print highlighted program
+ print(str(i) + ' FAIL', end='\t')
+ color_print(program, list(highlight))
+
+ # print rule
+ for rule_pattern, yes in rule.condition:
+ if rule.klass == 'T':
+ if rule_pattern in [pattern for pattern, nodes in program_patterns]:
+ print('good\t' + str(rule_pattern))
+ else:
+ print('missing\t' + str(rule_pattern))
+ else:
+ if rule_pattern in [pattern for pattern, nodes in program_patterns]:
+ print('buggy\t' + str(rule_pattern))
+ print()
+ break
+
+ if not found:
+ print(str(i) + ' FAIL\t' + str(program))
+ print()
+ unsuggestable[program] += 1
+
+ print('Suggestions and versions in which they were implemented:')
+ for s in suggested:
+ index = len(programs) - (s['passed'] + 1)
+ print(index, [index + m for m in s['matched']])
+ all_suggestions += suggested
+
+ print('-'*30)
+ print()
+
+# report
+not_matched = [s for s in all_suggestions if s['passed'] not in s['matched']]
+matched = [s for s in all_suggestions if s['passed'] in s['matched']]
+
+# rules that did / did not match in the solution
+good = collections.Counter()
+bad = collections.Counter()
+for s in all_suggestions:
+ (good if s in matched else bad)[s['rule']] += 1
+
+print('Statistics')
+print('----------')
+print('# of suggestions that were implemented:', len(matched))
+print('# of suggestions that were not implemented:', len(not_matched))
+print('avg. # of submissions before suggestion was implemented:',
+ sum(s['matched'][0] for s in matched)/len(matched))
+print('avg. # of submissions until PASS after suggestion was implemented:',
+ sum(s['passed'] - s['matched'][0] for s in matched)/len(matched))
+print('avg. # of submissions until PASS if suggestion was not implemented:',
+ sum(s['passed'] for s in not_matched)/len(not_matched))
+#print('avg. % of submissions after suggestion where it was not implemented :', 1-mean(len(s['matched'])/s['passed'] for s in matched))
+print()
+
+print('Unsuggestable programs')
+print('----------------------')
+for p, count in unsuggestable.most_common():
+ print('{}\t{}'.format(count, p))
+print()
+
+print('Good rules')
+print('----------')
+for r, count in good.most_common():
+ print('Suggested for ' + str(count) + ' submissions')
+ print(r)
+
+print('Bad rules')
+print('---------')
+for r, count in bad.most_common():
+ print('Suggested for ' + str(count) + ' submissions')
+ print(r)