From 27d4458613a5b61f16ad9bf59ca1de460fea3b3a Mon Sep 17 00:00:00 2001 From: Timotej Lazar Date: Mon, 9 Jan 2017 18:07:23 +0100 Subject: First commit is the best commit --- .gitignore | 2 + db/__init__.py | 82 +++++++++ db/models.py | 173 ++++++++++++++++++ monkey/__init__.py | 0 monkey/action.py | 239 +++++++++++++++++++++++++ monkey/patterns.py | 511 +++++++++++++++++++++++++++++++++++++++++++++++++++++ prolog/__init__.py | 0 prolog/lexer.py | 130 ++++++++++++++ prolog/parser.py | 236 +++++++++++++++++++++++++ prolog/util.py | 238 +++++++++++++++++++++++++ test-rules.py | 225 +++++++++++++++++++++++ 11 files changed, 1836 insertions(+) create mode 100644 .gitignore create mode 100644 db/__init__.py create mode 100644 db/models.py create mode 100644 monkey/__init__.py create mode 100644 monkey/action.py create mode 100644 monkey/patterns.py create mode 100644 prolog/__init__.py create mode 100644 prolog/lexer.py create mode 100644 prolog/parser.py create mode 100644 prolog/util.py create mode 100755 test-rules.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ca28708 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +prolog/parsetab.py diff --git a/db/__init__.py b/db/__init__.py new file mode 100644 index 0000000..783df6c --- /dev/null +++ b/db/__init__.py @@ -0,0 +1,82 @@ +# CodeQ: an online programming tutor. +# Copyright (C) 2015 UL FRI +# +# This program is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import os +import threading +import psycopg2 + +__all__ = ['get_connection', 'return_connection', 'setup', 'models'] + +_module_access_lock = threading.Lock() + +_host = None # the database hostname/IP +_port = None # the database port number +_database = None # the name of the database +_username = None # the username to access the database +_password = None # the password to access the database + + +# database parameters setup + +def _get_port(): + try: + return int(os.environ.get('CODEQ_DB_PORT')) + except: + return 5432 + + +def setup( + host=os.environ.get('CODEQ_DB_HOST') or 'localhost', + port=_get_port(), + database=os.environ.get('CODEQ_DB_DATABASE') or 'codeq', + username=os.environ.get('CODEQ_DB_USER') or 'codeq', + password=os.environ.get('CODEQ_DB_PASS') or 'c0d3q' +): + """Sets the database location and authentication parameters.""" + global _host, _port, _database, _username, _password + _host = host + _port = port + _database = database + _username = username + _password = password + +# connection pooling + +_connection_pool = [] + + +def get_connection(): + """Retrieves a database connection from the connection pool.""" + with _module_access_lock: + if _host is None: + setup() # lazy init + if len(_connection_pool) > 0: + return _connection_pool.pop() + return psycopg2.connect(host=_host, port=_port, database=_database, user=_username, password=_password) + + +def return_connection(connection): + """Returns the given database connection to the pool.""" + try: + connection.rollback() # sanity check + except: + return + with _module_access_lock: + _connection_pool.append(connection) + + +if __name__ == '__main__': + setup() diff --git a/db/models.py b/db/models.py new file mode 100644 index 0000000..9edbec4 --- /dev/null +++ b/db/models.py @@ -0,0 +1,173 @@ +# CodeQ: an online programming tutor. +# Copyright (C) 2015 UL FRI +# +# This program is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import collections + +from . import get_connection, return_connection + +__all__ = ['CodeqUser', 'Solution'] + +class CodeqUser(collections.namedtuple('CodeqUser', ['id', 'username', 'password', 'name', 'email', 'is_admin', 'is_active', 'date_joined', 'last_login', 'gui_lang', 'robot_address', 'saml_data', 'gui_layout'])): + __sql_prefix = 'select id, username, password, name, email, is_admin, is_active, date_joined, last_login, gui_lang, robot_address, saml_data, gui_layout from codeq_user' + + @staticmethod + def get(**kwargs): + return _general_get(kwargs, CodeqUser, CodeqUser.__sql_prefix) + + @staticmethod + def list(): + return _general_list(CodeqUser, CodeqUser.__sql_prefix) + + @staticmethod + def filter(**kwargs): + return _general_filter(kwargs, CodeqUser, CodeqUser.__sql_prefix) + + @staticmethod + def solved_problems(user_id, language): + return _run_sql('select g.identifier, p.identifier from solution s inner join problem p on p.id = s.problem_id inner join problem_group g on g.id = p.problem_group_id inner join language l on l.id = p.language_id where s.codeq_user_id = %s and l.identifier = %s and s.done = True', (user_id, language), fetch_one=False) + +class Problem(collections.namedtuple('Problem', ['id', 'language', 'group', 'identifier'])): + __sql_prefix = '''\ + select p.id, l.identifier, g.identifier, p.identifier + from problem p + inner join problem_group g on g.id = p.problem_group_id + inner join language l on l.id = p.language_id''' + __sql_order = 'p.language_id, p.problem_group_id, p.id' + + @staticmethod + def get(**kwargs): + kwargs = {'p.'+k: v for k, v in kwargs.items()} + return _general_get(kwargs, Problem, Problem.__sql_prefix) + + @staticmethod + def list(): + return _general_list(Problem, Problem.__sql_prefix, order=Problem.__sql_order) + + @staticmethod + def filter(**kwargs): + kwargs = {'p.'+k: v for k, v in kwargs.items()} + return _general_filter(kwargs, Problem, Problem.__sql_prefix, order=Problem.__sql_order) + + # get a list of problems with the given language identifier + @staticmethod + def filter_language(language): + kwargs = {'l.identifier': language} + return _general_filter(kwargs, Problem, Problem.__sql_prefix, order=Problem.__sql_order) + +# known as Attempt in the original code +class Solution(collections.namedtuple('Solution', ['id', 'done', 'content', 'problem_id', 'codeq_user_id', 'trace'])): + __sql_prefix = 'select id, done, content, problem_id, codeq_user_id, trace from solution' + + @staticmethod + def get(**kwargs): + return _general_get(kwargs, Solution, Solution.__sql_prefix) + + @staticmethod + def list(): + return _general_list(Solution, Solution.__sql_prefix) + + @staticmethod + def filter(**kwargs): + return _general_filter(kwargs, Solution, Solution.__sql_prefix) + + +def _no_row_conversion(row): + return row + +def _general_get(kwargs_dict, clazz, sql_select, row_conversion_fn=_no_row_conversion): + conditions = [] + parameters = [] + for field_name, field_value in kwargs_dict.items(): + conditions.append(field_name + ' = %s') + parameters.append(field_value) + if len(conditions) == 0: + return None + conn = get_connection() + try: + cur = conn.cursor('crsr1') # a named cursor: scrolling is done on the server + cur.arraysize = 1 # scroll unit in the number of rows + try: + cur.execute(sql_select + ' where ' + ' and '.join(conditions), parameters) + row = cur.fetchone() + if row: + return clazz(*row_conversion_fn(row)) + return None + finally: + cur.close() + finally: + conn.commit() + return_connection(conn) + +def _general_filter(kwargs_dict, clazz, sql_select, row_conversion_fn=_no_row_conversion, order='id'): + conditions = [] + parameters = [] + for field_name, field_value in kwargs_dict.items(): + conditions.append(field_name + ' = %s') + parameters.append(field_value) + if len(conditions) == 0: + return _general_list(clazz, sql_select) + conn = get_connection() + try: + cur = conn.cursor('crsr2') # a named cursor: scrolling is done on the server + cur.arraysize = 10000 # scroll unit in the number of rows + try: + cur.execute(sql_select + ' where ' + ' and '.join(conditions) + ' order by ' + order, parameters) + result = [] + row = cur.fetchone() + while row: + result.append(clazz(*row_conversion_fn(row))) + row = cur.fetchone() + return result + finally: + cur.close() + finally: + conn.commit() + return_connection(conn) + +def _general_list(clazz, sql_select, row_conversion_fn=_no_row_conversion, order='id'): + conn = get_connection() + try: + cur = conn.cursor('crsr3') # a named cursor: scrolling is done on the server + cur.arraysize = 10000 # scroll unit in the number of rows + try: + cur.execute(sql_select + ' order by ' + order) + result = [] + row = cur.fetchone() + while row: + result.append(clazz(*row_conversion_fn(row))) + row = cur.fetchone() + return result + finally: + cur.close() + finally: + conn.commit() + return_connection(conn) + +def _run_sql(sql, params, fetch_one=False): + conn = get_connection() + try: + cur = conn.cursor() + try: + cur.execute(sql, params) + if fetch_one: + return cur.fetchone() + else: + return cur.fetchall() + finally: + cur.close() + finally: + conn.commit() + return_connection(conn) diff --git a/monkey/__init__.py b/monkey/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/monkey/action.py b/monkey/action.py new file mode 100644 index 0000000..c2f73ad --- /dev/null +++ b/monkey/action.py @@ -0,0 +1,239 @@ +#!/usr/bin/python3 + +# CodeQ: an online programming tutor. +# Copyright (C) 2015 UL FRI +# +# This program is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import sys + +class Action: + def __init__(self, abstime, data): + self.type = data['typ'] + self.time = abstime # time from start + + # generic actions + if self.type == 'open': + self.timestamp = data['time'] + elif self.type == 'ins': + self.type = 'insert' + self.offset = data['off'] + self.text = data['txt'] + self.length = len(self.text) + elif self.type == 'rm': + self.type = 'remove' + self.offset = data['off'] + self.text = data['txt'] + self.length = len(self.text) + elif self.type == 'test': + # data['feedback'] is a 'test_results' hint object, or a list + # containing such hint + hint = None + if isinstance(data['feedback'], list): + for hint in data['feedback']: + if hint['id'] == 'test_results': + break + else: + hint = data['feedback'] + if hint is not None: + self.n_correct = hint['args']['passed'] + self.n_all = hint['args']['total'] + else: + self.n_correct = self.n_all = None + elif self.type == 'hint': + self.feedback = data['feedback'] + elif self.type == 'hnt': + # obsolete Prolog hint action, with no additional info + self.type = 'hint' + self.feedback = None + elif self.type == 'plan': + self.index = data.get('index') + + # Prolog actions + elif self.type == 'prolog_solve': + self.query = data['query'] + elif self.type == 'slva': + # obsolete Prolog "solve all" action + self.type = 'prolog_solve' + self.query = data['qry'] + elif self.type == 'experiment': + self.data = data['data'] + + # Python actions + elif self.type == 'python_input': + self.text = data['txt'] + elif self.type == 'python_run': + self.program = data['program'] + + # robot actions + elif self.type == 'robot_run': + self.program = data['program'] + + def __str__(self): + s = 't = ' + str(self.time/1000.0) + ' ' + self.type + if self.type in {'insert', 'remove'}: + s += ': "' + self.text.replace('\n', '\\n').replace('\t', '\\t') + '" at ' + str(self.offset) + elif self.type == 'test': + s += ': {0} / {1}'.format(self.n_correct, self.n_all) + elif self.type == 'hint': + if self.feedback is not None: + s += ': ' + ', '.join(sorted([hint['id'] for hint in self.feedback])) + else: + s += ': ?' + elif self.type == 'plan': + if self.index is not None: + s += ': ' + str(self.index) + elif self.type == 'prolog_solve': + s += ': "' + self.query + '"' + elif self.type == 'experiment': + s += ': ' + self.data; + return s + + # apply this action to text + def apply(self, text): + if self.type == 'insert': + return text[:self.offset] + self.text + text[self.offset:] + elif self.type == 'remove': + return text[:self.offset] + text[self.offset+self.length:] + else: + return text + + # reverse the application of this action + def unapply(self, text): + if self.type == 'insert': + return text[:self.offset] + text[self.offset+self.length:] + elif self.type == 'remove': + return text[:self.offset] + self.text + text[self.offset:] + else: + return text + +# parse log from database into a list of actions, cleaning up some fluff. +# ignore non-text actions (queries and tests) +def parse(data): + if data == None: + return [], [] + + actions = [] + incorrect = set() + + time = 0 + code = '' + for packet in data: + try: + time += packet['dt'] + action = Action(time, packet) + except: + # ignore any errors while decoding a packet + sys.stderr.write('Error decoding packet: {}\n'.format(packet)) + continue + + # skip normalization if this is the first action + if actions == []: + actions.append(action) + code = action.apply(code) + continue + + # add to list of actions; modify previously added action if necessary + prev = actions[-1] + + # remove superfluous REMOVE action when newline is inserted (due to editor auto-indent) + if prev.type == 'remove' and action.type == 'insert' and \ + action.time == prev.time and \ + action.offset == prev.offset and action.length > prev.length and \ + action.text[action.length-prev.length:] == prev.text: + # discard last REMOVE action + code = prev.unapply(code) + actions.pop() + + # replace current action with something better + length = action.length - prev.length + new = Action(prev.time, {'typ': 'ins', 'off': prev.offset, 'txt': action.text[:length]}) + actions.append(new) + code = new.apply(code) + + # remove superfluous INSERT action when newline is removed (due to editor auto-indent) + elif prev.type == 'remove' and action.type == 'insert' and \ + action.time == prev.time and \ + action.offset == prev.offset and action.length < prev.length and \ + prev.text[prev.length-action.length:] == action.text: + # discard last INSERT action + code = prev.unapply(code) + actions.pop() + + # replace current action with something better + length = prev.length - action.length + new = Action(prev.time, {'typ': 'rem', 'off': prev.offset, 'txt': prev.text[:length]}) + actions.append(new) + code = new.apply(code) + + # otherwise, simply append the current action + else: + actions.append(action) + code = action.apply(code) + + return actions + +# expand any multi-char actions (does not do anything for the 99%) +def expand(actions): + i = 0 + while i < len(actions): + if actions[i].type == 'insert' and len(actions[i].text) > 1: + a = actions.pop(i) + for offset in range(len(a.text)): + actions.insert(i+offset, + Action(a.time, {'typ': 'ins', 'off': a.offset+offset, 'txt': a.text[offset]})) + i += len(a.text) + elif actions[i].type == 'remove' and len(actions[i].text) > 1: + a = actions.pop(i) + for offset in range(len(a.text)): + actions.insert(i, + Action(a.time, {'typ': 'rm', 'off': a.offset+offset, 'txt': a.text[offset]})) + i += len(a.text) + else: + i += 1 + +# some sample code +if __name__ == '__main__': + from db.models import Problem, Solution + + # print all problem ids + print('problems:') + for problem in Problem.list(): + print(' {}\t{}'.format(problem.id, problem.identifier)) + print() + + pid = input('enter problem id: ') + + # print all attempt ids for the selected problem + print('users solving problem ' + str(pid) + ':') + attempts = Solution.filter(problem_id=pid) + print(', '.join([str(attempt.codeq_user_id) for attempt in attempts])) + print() + + uid = input('enter user id: ') + attempt = Solution.get(problem_id=pid, codeq_user_id=uid) + + try: + actions = parse(attempt.trace) + print('read ' + str(len(actions)) + ' actions from log') + + print('code versions for this attempt:') + code = '' + for action in actions: + code = action.apply(code) + print(action) + print(code.strip()) + print() + except Exception as ex: + sys.stderr.write('Error parsing action log: ' + str(ex)) diff --git a/monkey/patterns.py b/monkey/patterns.py new file mode 100644 index 0000000..ae958a0 --- /dev/null +++ b/monkey/patterns.py @@ -0,0 +1,511 @@ +# CodeQ: an online programming tutor. +# Copyright (C) 2015 UL FRI +# +# This program is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import collections +from itertools import combinations +import pickle +import random +import re +import sys + +from nltk import ParentedTree, Tree +from nltk.tgrep import tgrep_positions + +from prolog.util import Token, parse as prolog_parse + +# construct pattern to match the structure of nodes given by [include] +def pattern(node, include): + if isinstance(node, Token): + if node.type == 'NAME': + return '"{}"'.format(node.val) + return None + + if any(n is node for n in include): + if node.label() == 'variable': + return 'variable <: "{}"'.format(node[0].val) + return None + + label = node.label() + subpats = [pattern(child, include) for child in node] + + if label == 'functor': + return '{} <1 ({})'.format(label, subpats[0]) + + pat = None + if any(subpats): + if label == 'and': + if subpats[1]: + pat = subpats[1] + if subpats[0]: + if pat: + pat = subpats[0] + ' .. (' + pat + ')' + else: + pat = subpats[0] + elif label == 'args': + pat = label + for i, subpat in enumerate(subpats): + if subpat: + pat += ' <{} ({})'.format(i+1, subpat) + elif label == 'binop': + pat = label + if subpats[0]: + pat += ' <1 ({})'.format(subpats[0]) + pat += ' <2 ("{}")'.format(node[1].val) + if subpats[2]: + pat += ' <3 ({})'.format(subpats[2]) + elif label == 'clause': + pat = label + for i, subpat in enumerate(subpats): + if subpat: + pat += ' {} ({})'.format('<1' if i == 0 else '<<', subpats[i]) + elif label == 'compound': + if len(subpats) > 1 and subpats[1]: + pat = label + for i, subpat in enumerate(subpats): + pat += ' <{} ({})'.format(i+1, subpat) + else: + return None + elif label == 'head': + pat = label + pat += ' <1 ({})'.format(subpats[0]) + if not pat: + for s in subpats: + if s: + pat = s + break + return pat + +# construct a pattern connecting (variable) node [a] to [b] +def connect(a, b): + path_a = [] + node = a + while node.parent(): + node = node.parent() + path_a.insert(0, node) + + path_b = [] + node = b + while node.parent(): + node = node.parent() + path_b.insert(0, node) + + common_ancestor = None + for i in range(min(len(path_a), len(path_b))): + if path_a[i] is not path_b[i]: + break + if path_a[i].label() in {'compound', 'head'}: + break + common_ancestor = path_a[i] + + def node_label(node): + if node.label() == 'compound': + right = node[1] + nargs = 1 + while right._label == "args" and len(right) == 2: + right = right[1] + nargs += 1 + return 'compound <1 (functor <: "{}{}")'.format(node[0][0].val, nargs) + if node.label() == 'binop': + return 'binop <2 "{}"'.format(node[1].val) + return node.label() + i = 0 + while path_a[i].label() != 'clause': + i += 1 + + # path from top to common ancestor + pat = path_a[i].label() + i += 1 + n_top = 0 + while i < min(len(path_a), len(path_b)) and path_a[i] is path_b[i]: + node = path_a[i] + i += 1 + if node.label() == 'and': + continue + if node.parent().label() == 'and': + op = '<+(and)' + else: + op = '<{}'.format(node.parent_index()+1) + pat += ' {} ({}'.format(op, node_label(node)) + n_top += 1 + + path_a = path_a[i:] + path_b = path_b[i:] + + # path from common ancestor to [a] + n_a = 0 + for node in path_a: + if node.label() == 'and': + continue + op = '<' + if node.parent().label() == 'and': + op = '<+(and)' + elif node.parent_index() is not None: + op = '<{}'.format(node.parent_index()+1) + pat += ' {} ({}'.format(op, node_label(node)) + n_a += 1 + pat += ' <{} ({} <: "{}")'.format(a.parent_index()+1, a.label(), a[0].val) + pat += ')' * n_a + + # path from common ancestor to [b] + n_b = 0 + for node in path_b: + if node.label() == 'and': + continue + op = '<' + if node.parent().label() == 'and': + op = '<+(and)' + elif node.parent_index() is not None: + op = '<{}'.format(node.parent_index()+1) + pat += ' {} ({}'.format(op, node_label(node)) + n_b += 1 + pat += ' <{} ({} <: "{}")'.format(b.parent_index()+1, b.label(), b[0].val) + + pat += ')' * (n_top + n_b) + return pat + +# replace variable names with patterns and backlinks +def postprocess(pattern): + macros = '@ VAR /[A-Z]/; ' + #m = re.search(r'"[A-Z][^"]*_[0-9]+[^"]*"', pattern) + m = re.search(r'"[A-Z]_[0-9]+"', pattern) + nvars = 0 + while m is not None: + orig_name = m.group(0) + n = orig_name.strip('"').split("_")[1] + pat_name = 'v{}'.format(nvars) + nvars += 1 + pattern = pattern[:m.start()] + '@VAR={}{}'.format(pat_name, n) + pattern[m.end():] + for m in re.finditer(orig_name, pattern): + pattern = pattern[:m.start()] + '~{}{}'.format(pat_name, n) + pattern[m.end():] + m = re.search(r'"([A-Z]*_[0-9]+)"', pattern) + return macros + pattern + +def postprocess_simple(pattern): + pattern = postprocess(pattern) + if pattern.startswith("@ VAR /[A-Z]/; clause <2 (or"): + return None + #relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|\(variable.*?\))', pattern) + relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', pattern) + + # elements + elements = [] + current = "" + i = 0 + par = 0 + while i < len(pattern): + if par > 0 and pattern[i] == ")": + par -= 1 + if par == 0: + elements.append(current) + current = "" + if par > 0 and pattern[i] == "(": + par += 1 + if par == 0 and \ + (pattern[i:].startswith("(head") or + pattern[i:].startswith("(compound") or + pattern[i:].startswith("(binop")): + par = 1 + if par > 0: + current += pattern[i] + i += 1 + # simplify variable + for ei, e in enumerate(elements): + # #elements[ei] = re.sub("\(variable.*?\)", "(variable)", e) + elements[ei] = "("+" ".join(re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', e))+")" + elements = sorted(elements) + #print(pattern) + #print(relevant) + #return "_".join(relevant)#pattern + return " ".join(elements)#pattern + +# construct pattern to match the structure of nodes given by [include], +# supports variables and literals +def pattern2(node, include): + if isinstance(node, Token): + return None + + label = node.label() + if any(n is node for n in include): + if label == 'literal': + return '"{}"'.format(node[0].val) + if label == 'variable': + return '{}'.format(label) + return None + if label == 'functor': + return '({} "{}")'.format(label, node[0].val) + + subpats = [pattern2(child, include) for child in node] + pat = None + if any(subpats): + if label == 'and': + if subpats[1]: + pat = subpats[1] + if subpats[0]: + if pat: + pat = subpats[0] + ' ' + pat + else: + pat = subpats[0] + elif label == 'args': + pat = label + for i, subpat in enumerate(subpats): + if subpat: + pat += ' {}'.format(subpat) + pat = '(' + pat + ')' + elif label == 'unop': + pat = '(' + label + ' ' + node[0].val + ' ' + subpats[1] + ')' + elif label == 'binop': + pat = label + if subpats[0]: + pat += ' {}'.format(subpats[0]) + pat += ' "{}"'.format(node[1].val) + if subpats[2]: + pat += ' {}'.format(subpats[2]) + pat = '(' + pat + ')' + elif label == 'clause': + pat = label + for i, subpat in enumerate(subpats): + if subpat: + pat += ' {}'.format(subpats[i]) + return '(' + pat + ')' + elif label == 'compound': + if len(subpats) > 1 and subpats[1]: + pat = label + for i, subpat in enumerate(subpats): + pat += ' {}'.format(subpat) + pat = '(' + pat + ')' + else: + return None + elif label == 'head': + pat = label + pat += ' {}'.format(subpats[0]) + pat = '(' + pat + ')' + elif label == 'list': + pat = 'list ' + if subpats[0]: + pat += '(h {})'.format(subpats[0]) + if subpats[0] and subpats[1]: + pat += ' ' + if subpats[1]: + pat += '(t {})'.format(subpats[1]) + pat = '(' + pat + ')' + if not pat: + for s in subpats: + if s: + pat = s + break + return pat + +def get_patterns(tree): + orig = tree + if isinstance(tree, str): + tree = prolog_parse(tree) + if tree is None: + return + tree = ParentedTree.convert(tree) + + # get patterns separately for each clause + for clause in tree: + all_variables = [] + variables = collections.defaultdict(list) + def walk(node): + if isinstance(node, Tree): + if node.label() == 'variable': + name = node[0].val + variables[name].append(node) + all_variables.append(node) + else: + for child in node: + walk(child) + walk(clause) + + # connecting pairs of nodes with same variable + for var, nodes in variables.items(): + for selected in combinations(nodes, 2): + pat = pattern2(clause, selected) + if pat: + print(pat) + yield pat, selected + #pat = connect(*selected) + #if pat: + # pp = postprocess_simple(pat) + # if pp: + # yield pp, selected + + # add singletons + for var, nodes in variables.items(): + if len(nodes) == 1: + pat = pattern2(clause, nodes) + if pat: + print(pat) + yield pat, nodes + #pat = pattern(tree, var) + #if pat: + # pp = postprocess_simple(pat) + # if pp: + # yield pp, nodes + + # add patterns for literal/variable pairs + literals = [node for node in clause.subtrees() if node.label() == 'literal'] + for literal in literals: + top = literal + while top != clause and top.parent().label() in {'compound', 'binop', 'unop', 'args', 'list'}: + top = top.parent() + variables = [node for node in top.subtrees() if node.label() == 'variable'] + for var in variables: + pat = pattern2(clause, [var, literal]) + if pat: + yield pat, [var, literal] + +# # connecting pairs of nodes with variables +# for selected in combinations(all_variables, 2): +# pat = connect(selected[0], selected[1]) +# if pat: +# yield postprocess(pat), selected + +# # using pairs of nodes with variables +# for selected in combinations(all_variables, 2): +# pat = pattern(tree, selected) +# if pat: +# yield postprocess(pat), selected + +# # using pairs of nodes with same variable +# for var, nodes in variables.items(): +# for selected in combinations(nodes, 2): +# pat = pattern(tree, selected) +# if pat: +# yield postprocess(pat), selected + +# # using each variable separately +# for var, nodes in variables.items(): +# pat = pattern(tree, nodes) +# if pat: +# yield postprocess(pat), nodes + +# # using each goal to select variables FIXME +# goals = [s for s in tree.subtrees() if s.label() in {'compound', 'binop'}] +# for goal in goals: +# goal_vars = {n.val for n in goal.leaves() if n.type == 'VARIABLE'} +# pat = pattern(tree, goal_vars) +# if pat: +# yield postprocess(pat) + +# nltk.tgrep does not play nice with non-Tree leaves +def _tgrep_prepare(tree): + if not isinstance(tree, ParentedTree): + tree = ParentedTree.convert(tree) + def prepare(node): + if isinstance(node, Token) or isinstance(node, str): + return ParentedTree(str(node), []) + return ParentedTree(node.label(), [prepare(child) for child in node]) + return prepare(tree) + +def find_motif(tree, motif): + tree = _tgrep_prepare(tree) + return tgrep_positions(motif, [tree]) + +# Extract edits and other data from existing traces for each problem. +# Run with: python3 -m monkey.patterns +if __name__ == '__main__': + # Ignore traces from these users. + ignored_users = [ + 1, # admin + 231, # test + 360, # test2 + 358, # sasha + ] + + pid = int(sys.argv[1]) + name = sys.argv[2] + submissions = pickle.load(open('pickle/programs-{}.pickle'.format(pid), 'rb')) + + print('Analyzing programs for problem {}…'.format(pid)) + ndata = { + 'train': [], + 'test': [] + } + # get all users + users = set() + for code, data in submissions.items(): + users |= data['users'] + users = list(users) + users.sort() + random.Random(0).shuffle(users) + split = int(len(users)*0.7) + learn_users = set(users[:split]) + test_users = set(users[split:]) + print(test_users) + + for code, data in submissions.items(): + if len(code) > 1000 or prolog_parse(code) is None: + continue + if name not in code: + continue + ndata['train'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & learn_users) + ndata['test'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & test_users) + #programs += [(code, data['n_tests'] == data['n_passed'])] * len(data['users']) + + print('correct: {} ({} unique)'.format( + len([code for code, correct in ndata['train'] if correct]), + len({code for code, correct in ndata['train'] if correct}))) + print('incorrect: {} ({} unique)'.format( + len([code for code, correct in ndata['train'] if not correct]), + len({code for code, correct in ndata['train'] if not correct}))) + + #iprograms.sort() + #random.Random(0).shuffle(programs) + #split = int(len(programs)*0.7) + #data = { + # 'train': programs[:split], + # 'test': programs[split:], + #} + data = ndata + + # extract attributes from training data + patterns = collections.Counter() + for code, correct in data['train']: + for pat, nodes in get_patterns(code): + patterns[pat] += 1 + + attrs = [] + with open('data/attributes-{:03}.tab'.format(pid), 'w') as pattern_file: + for i, (pat, count) in enumerate(patterns.most_common()): + if count < 1: + break + attrs.append(pat) + print('a{}\t{}'.format(i, pat), file=pattern_file) + + # check and write attributes for training/test data + for t in ['train', 'test']: + with open('data/programs-{:03}-{}.tab'.format(pid, t), 'w') as f: + # print header + print('\t'.join(['code', 'correct'] + ['a'+str(i) for i in range(len(attrs))]), file=f) + print('\t'.join(['d'] * (len(attrs)+2)), file=f) + print('meta\tclass', file=f) + for code, correct in data[t]: + record = '{}\t{}'.format(repr(code), 'T' if correct else 'F') + + ## check attributes by using tgrep to find patterns + #tree = _tgrep_prepare(prolog_parse(code)) + #for pat in attrs: + # matches = list(tgrep_positions(pat, [tree])) + # record += '\t{}'.format(bool(matches[0])) + + # check attributes by enumerating patterns in code + code_pats = [pat for pat, nodes in get_patterns(code)] + for pat in attrs: + record += '\t{}'.format('T' if pat in code_pats else 'F') + + print(record, file=f) diff --git a/prolog/__init__.py b/prolog/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/prolog/lexer.py b/prolog/lexer.py new file mode 100644 index 0000000..5023e57 --- /dev/null +++ b/prolog/lexer.py @@ -0,0 +1,130 @@ +#!/usr/bin/python3 + +# CodeQ: an online programming tutor. +# Copyright (C) 2015 UL FRI +# +# This program is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import ply.lex as lex + +# LEXER + +#states = ( +# ('comment', 'exclusive'), +#) + +# tokens; treat operators as names if followed by ( +operators = { + r':-': 'FROM', + r'-->': 'FROMDCG', + r'->': 'IMPLIES', + r'\+': 'NOT', + r'=': 'EQU', + r'\=': 'NEQU', + r'==': 'EQ', + r'\==': 'NEQ', + r'=..': 'UNIV', + r'is': 'IS', + r'=:=': 'EQA', + r'=\=': 'NEQA', + r'<': 'LT', + r'=<': 'LE', + r'>': 'GT', + r'>=': 'GE', + r'@<': 'LTL', + r'@=<': 'LEL', + r'@>': 'GTL', + r'@>=': 'GEL', + r'#=': 'EQFD', + r'#\=': 'NEQFD', + r'#<': 'LTFD', + r'#=<': 'LEFD', + r'#>': 'GTFD', + r'#>=': 'GEFD', + r'in': 'IN', + r'ins': 'INS', + r'..': 'THROUGH', + r'+': 'PLUS', + r'-': 'MINUS', + r'*': 'STAR', + r'/': 'DIV', + r'//': 'IDIV', + r'mod': 'MOD', + r'**': 'POW', + r'^': 'POW', + r'.': 'PERIOD', + r',': 'COMMA', + r';': 'SEMI' +} +tokens = sorted(list(operators.values())) + [ + 'UINTEGER', 'UREAL', + 'NAME', 'VARIABLE', 'STRING', + 'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'PIPE', 'LBRACE', 'RBRACE', + 'INVALID' +] + +# punctuation +t_LBRACKET = r'\[' +t_RBRACKET = r'\]' +t_LPAREN = r'\(' +t_RPAREN = r'\)' +t_PIPE = r'\|' +t_LBRACE = r'{' +t_RBRACE = r'}' + +t_UINTEGER = r'[0-9]+' +t_UREAL = r'[0-9]+\.[0-9]+([eE][-+]?[0-9]+)?|inf|nan' +t_VARIABLE = r'(_|[A-Z])[a-zA-Z0-9_]*' +t_STRING = r'"(""|\\.|[^\"])*"' + +# no support for nested comments yet +def t_comment(t): + r'(/\*(.|\n)*?\*/)|(%.*)' + pass + +def t_NAME(t): + r"'(''|\\.|[^\\'])*'|[a-z][a-zA-Z0-9_]*|[-+*/\\^<>=~:.?@#$&]+|!|;|," + if t.value == ',' or \ + t.lexer.lexpos >= len(t.lexer.lexdata) or t.lexer.lexdata[t.lexer.lexpos] != '(': + t.type = operators.get(t.value, 'NAME') + return t + +t_ignore = ' \t' + +def t_newline(t): + r'\n+' + t.lexer.lineno += len(t.value) + +def t_error(t): + # TODO send this to stderr + #print("Illegal character '" + t.value[0] + "'") + t.type = 'INVALID' + t.value = t.value[0] + t.lexer.skip(1) + return t + +lexer = lex.lex(errorlog=lex.NullLogger()) + +if __name__ == '__main__': + while True: + try: + s = input('> ') + except EOFError: + break + if not s: + continue + + lexer.input(s) + tokens = list(lexer) + print(tokens) diff --git a/prolog/parser.py b/prolog/parser.py new file mode 100644 index 0000000..4d4f9e9 --- /dev/null +++ b/prolog/parser.py @@ -0,0 +1,236 @@ +# CodeQ: an online programming tutor. +# Copyright (C) 2015 UL FRI +# +# This program is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from nltk import Tree +import ply.yacc as yacc +from .lexer import tokens +from .util import Token + +# PARSER +precedence = ( + ('nonassoc', 'FROM', 'FROMDCG'), + ('right', 'PIPE'), + ('right', 'IMPLIES'), + ('right', 'NOT'), + ('nonassoc', 'EQU', 'NEQU', 'EQ', 'NEQ', 'UNIV', 'IS', 'EQA', 'NEQA', 'LT', 'LE', 'GT', 'GE', 'LTL', 'LEL', 'GTL', 'GEL', 'IN', 'INS', 'THROUGH', 'EQFD', 'NEQFD', 'LTFD', 'LEFD', 'GTFD', 'GEFD'), + ('left', 'PLUS', 'MINUS'), + ('left', 'STAR', 'DIV', 'IDIV', 'MOD'), + ('nonassoc', 'POW'), + ('right', 'UMINUS', 'UPLUS'), + ('nonassoc', 'UINTEGER', 'UREAL'), + ('nonassoc', 'NAME', 'VARIABLE', 'STRING'), + ('nonassoc', 'PERIOD'), + ('nonassoc', 'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'COMMA', 'SEMI', 'LBRACE', 'RBRACE') +) + +def make_token(p, n): + lextoken = p.slice[n] + return Token(lextoken.type, lextoken.value, lextoken.lexpos) + +def p_text_empty(p): + 'text : ' + p[0] = Tree('text', []) +def p_text_clause(p): + 'text : text clause' + p[0] = p[1] + p[0].append(p[2]) + +def p_clause_head(p): + 'clause : head PERIOD' + p[0] = Tree('clause', [p[1]]) +def p_clause_rule(p): + '''clause : head FROM or PERIOD + | head FROMDCG or PERIOD''' + p[0] = Tree('clause', [p[1], p[3]]) + +def p_head(p): + 'head : term' + p[0] = Tree('head', [p[1]]) + +def p_or_single(p): + 'or : if' + p[0] = p[1] +def p_or_if(p): + 'or : or SEMI if' + p[0] = Tree('or', [p[1], p[3]]) + +def p_if_single(p): + 'if : and' + p[0] = p[1] +def p_if_and(p): + 'if : and IMPLIES if' + p[0] = Tree('if', [p[1], p[3]]) + +def p_and_single(p): + 'and : term' + p[0] = p[1] +def p_and_term(p): + 'and : term COMMA and' + p[0] = Tree('and', [p[1], p[3]]) + +# Special case for zero-arity predicates supported by SWI-Prolog. +def p_term_functor_zero(p): + 'term : functor LPAREN RPAREN' + # No whitespace allowed between functor and LPAREN. + t2 = make_token(p, 2) + if p[1][0].pos + len(p[1][0].val) < t2.pos: + raise SyntaxError('whitespace before ' + str(t2)) + p[0] = Tree('compound', [p[1]]) +def p_term_functor(p): + 'term : functor LPAREN args RPAREN' + # No whitespace allowed between functor and LPAREN. + t2 = make_token(p, 2) + if p[1][0].pos + len(p[1][0].val) < t2.pos: + raise SyntaxError('whitespace before ' + str(t2)) + p[0] = Tree('compound', [p[1], p[3]]) + +def p_term_or(p): + 'term : LPAREN or RPAREN' + p[0] = p[2] +def p_term_binary(p): + '''term : term PLUS term + | term MINUS term + | term STAR term + | term POW term + | term DIV term + | term IDIV term + | term MOD term + + | term EQU term + | term NEQU term + | term EQ term + | term NEQ term + | term UNIV term + | term IS term + + | term EQA term + | term NEQA term + | term LT term + | term LE term + | term GT term + | term GE term + + | term LTL term + | term LEL term + | term GTL term + | term GEL term + + | term PIPE term + | term THROUGH term + | term IN term + | term INS term + | term EQFD term + | term NEQFD term + | term LTFD term + | term LEFD term + | term GTFD term + | term GEFD term''' + p[0] = Tree('binop', [p[1], make_token(p, 2), p[3]]) +def p_term_unary(p): + '''term : NOT term + | MINUS term %prec UMINUS + | PLUS term %prec UPLUS''' + p[0] = Tree('unop', [make_token(p, 1), p[2]]) +def p_term_list(p): + 'term : list' + p[0] = p[1] + +def p_term_variable(p): + 'term : VARIABLE' + p[0] = Tree('variable', [make_token(p, 1)]) +def p_term_simple(p): + '''term : STRING + | UINTEGER + | UREAL''' + p[0] = Tree('literal', [make_token(p, 1)]) +def p_term_name(p): + 'term : NAME' + p[0] = make_token(p, 1) + +def p_term_clpr(p): + 'term : LBRACE clpr RBRACE' + p[0] = Tree('term', [make_token(p, 1), p[2], make_token(p, 3)]) + +# compound term arguments +def p_args_single(p): + 'args : term' + p[0] = Tree('args', [p[1]]) +def p_args_multiple(p): + 'args : term COMMA args' + p[0] = Tree('args', [p[1], p[3]]) + +# list elements +def p_elems_single(p): + 'elems : term' + if isinstance(p[1], Tree) and p[1].label() == 'binop' and p[1][1].type == 'PIPE': + p[0] = Tree('list', [p[1][0]]) + if p[1][2] != Tree('term', [Token(type='NIL', val='[]')]): + p[0].append(p[1][2]) + else: + p[0] = Tree('list', [p[1], Token(type='NIL', val='[]')]) +def p_elems_multiple(p): + 'elems : term COMMA elems' + p[0] = Tree('list', [p[1], p[3]]) + +def p_list_empty(p): + 'list : LBRACKET RBRACKET' + p[0] = Tree('literal', [Token(type='NIL', val='[]')]) +def p_list(p): + 'list : LBRACKET elems RBRACKET' + p[0] = p[2] + +def p_functor(p): + 'functor : NAME' + p[0] = Tree('functor', [make_token(p, 1)]) + +# CLP(R) syntax +def p_clpr_single(p): + 'clpr : clpr_constr' + p[0] = Tree('clpr', [p[1]]) +def p_clpr_more(p): + '''clpr : clpr_constr COMMA clpr + | clpr_constr SEMI clpr''' + p[0] = Tree('clpr', [p[1], make_token(p, 2), p[3]]) +# XXX temporary until the new parser is in place, this also covers { } notation for DCGs +def p_clpr_constr(p): + 'clpr_constr : term' + p[0] = p[1] + +def p_error(t): + if t is None: + raise SyntaxError('unexpected end of file') + else: + raise SyntaxError('{}: unexpected {}'.format(t.lexpos, t.value)) + +parser = yacc.yacc(debug=False) + +if __name__ == '__main__': + from .util import stringify + while True: + try: + s = input('> ') + except EOFError: + break + if not s: + continue + ast = parser.parse(s) + def pp(node): + if isinstance(node, Tree): + return '(' + node.label() + ' ' + ' '.join([pp(child) for child in node]) + ')' + return '"' + str(node) + '"' + print(pp(ast)) + #ast.pretty_print() + #print(stringify(ast)) diff --git a/prolog/util.py b/prolog/util.py new file mode 100644 index 0000000..402b920 --- /dev/null +++ b/prolog/util.py @@ -0,0 +1,238 @@ +# CodeQ: an online programming tutor. +# Copyright (C) 2015 UL FRI +# +# This program is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from collections import namedtuple +from collections.abc import Iterable +import string + +from nltk import Tree + +# Stores a token's type and value, and optionally the position of the first +# character in the lexed stream. +class Token(namedtuple('Token', ['type', 'val', 'pos'])): + __slots__ = () + + # Custom constructor to support default parameters. + def __new__(cls, type, val='', pos=None): + return super(Token, cls).__new__(cls, type, val, pos) + + def __str__(self): + return str(self.val) + + # Only consider type and value when comparing tokens. There is probably a + # cleaner way of doing this. + __eq__ = lambda x, y: x[0] == y[0] and x[1] == y[1] + __ne__ = lambda x, y: x[0] != y[0] or x[1] != y[1] + __lt__ = lambda x, y: tuple.__lt__(x[0:2], y[0:2]) + __le__ = lambda x, y: tuple.__le__(x[0:2], y[0:2]) + __ge__ = lambda x, y: tuple.__ge__(x[0:2], y[0:2]) + __gt__ = lambda x, y: tuple.__gt__(x[0:2], y[0:2]) + + # Only hash token's value (we don't care about position, and types are + # determined by values). + def __hash__(self): + return hash(self[1]) + + # Return a copy of this token, possibly modifying some fields. + def clone(self, type=None, val=None, pos=None): + return Token(self.type if type is None else type, + self.val if val is None else val, + self.pos if pos is None else pos) + +from .lexer import lexer, operators +from .parser import parser + +def parse(code): + try: + return parser.parse(code) + except SyntaxError: + return None + +# Return a list of tokens in [text]. +def tokenize(text): + lexer.input(text) + return [Token(t.type, t.value, t.lexpos) for t in lexer] + +# Return a one-line string representation of [obj] which may be a Tree or a +# list of tokens. +def stringify(obj): + if isinstance(obj, Token): + if obj.type in ('PERIOD', 'COMMA'): + return str(obj) + ' ' + if obj.type in operators.values(): + return ' ' + str(obj) + ' ' + return str(obj) + if isinstance(obj, Iterable): + if isinstance(obj, Tree) and obj.label() == 'clause': + return ''.join([stringify(child) for child in obj]) + '\n' + return ''.join([stringify(child) for child in obj]) + +# Return a canonical name for the [n]th variable in scope. +def canonical_varname(n): + names = string.ascii_uppercase + if n < len(names): + return names[n] + return 'X{}'.format(n) + +# Rename variables in [tokens] to A0, A1, A2,… in order of appearance. +def rename_vars_list(tokens, names=None): + if names is None: + names = {} + next_id = len(names) + + # Return a new list. + tokens = list(tokens) + for i, t in enumerate(tokens): + if t.type == 'VARIABLE' and t.val != '_': + cur_name = t.val + if cur_name not in names: + names[cur_name] = canonical_varname(next_id) + next_id += 1 + tokens[i] = t.clone(val=names[cur_name]) + return tokens + +# Rename variables in AST rooted at [root] to A0, A1, A2,… in order of +# appearance. +def rename_vars_ast(root, fixed_names=None): + if fixed_names is None: + fixed_names = {} + names = {} + next_id = len(fixed_names) + len(names) + + def rename_aux(node): + nonlocal fixed_names, names, next_id + if isinstance(node, Tree): + if node.label() == 'clause': + names = {} + next_id = len(fixed_names) + len(names) + new_children = [rename_aux(child) for child in node] + new_node = Tree(node.label(), new_children) + elif isinstance(node, Token): + if node.type == 'VARIABLE': + token = node + if token.val.startswith('_'): + new_node = token.clone(val=canonical_varname(next_id)) + next_id += 1 + else: + cur_name = token.val + if cur_name in fixed_names: + new_name = fixed_names[cur_name] + else: + if cur_name not in names: + names[cur_name] = canonical_varname(next_id) + next_id += 1 + new_name = names[cur_name] + new_node = token.clone(val=new_name) + else: + new_node = node + return new_node + return rename_aux(root) + +# Yield "interesting" parts of a Prolog AST as lists of tokens. +def interesting_ranges(ast, path=()): + if ast.label() in {'clause', 'head', 'or', 'if', 'and'}: + if ast.label() == 'clause': + # Special case for clause with one goal. + if len(ast) == 4 and ast[2].label() == 'term': + terminals = ast[2].leaves() + yield terminals, path + (ast.label(), 'and') + + if ast.label() == 'and': + for i in range(0, len(ast), 2): + for j in range(i, len(ast), 2): + subs = ast[i:j+1] + terminals = [] + for s in subs: + terminals.extend([s] if isinstance(s, Token) else s.leaves()) + # We want at least some context. + if len(terminals) > 1: + yield terminals, path + (ast.label(),) + else: + terminals = ast.leaves() + # We want at least some context. + if len(terminals) > 1: + yield terminals, path + (ast.label(),) + + for subtree in ast: + if isinstance(subtree, Tree): + yield from interesting_ranges(subtree, path + (ast.label(),)) + +# Map "formal" variable names in the edit a→b to actual names in code [tokens]. +# The set [variables] contains all variable names in the current scope. These +# are used in cases such as [A]→[A,B], where the edit introduces new variables. +# Return a new version of b with actual variable names. +def map_vars(a, b, tokens, variables): + mapping = {} + new_index = 0 + for i in range(len(a)): + if tokens[i].type == 'VARIABLE': + formal_name = a[i].val + if tokens[i].val != '_': + actual_name = tokens[i].val + else: + actual_name = 'New'+str(new_index) + new_index += 1 + mapping[formal_name] = actual_name + + remaining_formal = [t.val for t in b if t.type == 'VARIABLE' and t.val not in mapping.keys()] + remaining_actual = [var for var in variables if var not in mapping.values()] + + while len(remaining_actual) < len(remaining_formal): + remaining_actual.append('New'+str(new_index)) + new_index += 1 + + for i, formal_name in enumerate(remaining_formal): + mapping[formal_name] = remaining_actual[i] + + return [t if t.type != 'VARIABLE' else t.clone(val=mapping[t.val]) for t in b] + +# Return a set of predicate names (e.g. conc/3) used in [program]. +def used_predicates(program): + predicates = set() + def walk(tree, dcg=False): + if isinstance(tree, Tree): + # DCG predicates can be called without parameters + if tree.label() == 'clause' and len(tree) == 4 and \ + tree[1].type == 'FROMDCG': + dcg = True + if tree.label() == 'term' and len(tree) >= 3 and \ + isinstance(tree[0], Tree) and tree[0].label() == 'functor': + if len(tree) == 3: + predicates.add('{}/0'.format(tree[0][0])) + else: + predicates.add('{}/{}'.format(tree[0][0], (len(tree[2])+1)//2)) + for subtree in tree: + walk(subtree, dcg) + elif isinstance(tree, Token): + if dcg and tree.type == 'NAME': + predicates.add('{}/{}'.format(tree.val, 2)) + predicates.add('{}/{}'.format(tree.val, 3)) + predicates.add('{}/{}'.format(tree.val, 4)) + tree = parse(program) + if tree is not None: + walk(tree) + return predicates + +# Basic sanity check. +if __name__ == '__main__': + var_names = {} + before = rename_vars(tokenize("dup([A0|A1], [A2|A3])"), var_names) + after = rename_vars(tokenize("dup([A0|A1], [A5, A4|A3])"), var_names) + + line = lines[0] + variables = [t.val for t in tokenize(code) if t.type == 'VARIABLE'] + mapped = map_vars(before, after, line, variables) + print(mapped) diff --git a/test-rules.py b/test-rules.py new file mode 100755 index 0000000..bb89e7e --- /dev/null +++ b/test-rules.py @@ -0,0 +1,225 @@ +#!/usr/bin/python3 + +import collections +import os.path +import pickle +import re +from statistics import mean +import sys + +from termcolor import colored + +from monkey.action import parse as parse_trace +from monkey.patterns import get_patterns +from prolog.util import parse as prolog_parse, rename_vars_list, stringify, tokenize + +# script arguments +solutions_file = sys.argv[1] +pid = int(sys.argv[2]) +data_dir = sys.argv[3] + +attributes_file = os.path.join(data_dir, 'attributes') +rules_file = os.path.join(data_dir, 'rules') +users_file = os.path.join(data_dir, 'users-test') +programs_file = os.path.join(data_dir, 'programs.pickle') + +# read test results for known programs +test = pickle.load(open(programs_file, 'rb')) + +# read traces +users = [int(line.strip()) for line in open(users_file, 'r').readlines()] +traces = {} +for solution in pickle.load(open(solutions_file, 'rb')): + if solution.problem_id == pid and solution.codeq_user_id in users: + traces[solution.codeq_user_id] = solution.trace + +# read attributes +attributes = dict([line.strip().split('\t') for line in open(attributes_file, 'r').readlines()]) + +class Rule(collections.namedtuple('Rule', ['klass', 'condition', 'distribution', 'quality'])): + def __str__(self): + s = 'Rule: class = {}, distribution = {}, quality = {}\n'.format(self.klass, self.distribution, self.quality) + s += ''.join([str(pattern) + '\n' for pattern, yes in self.condition]) + return s + +# read rules +rules = [] +for line in open(rules_file, 'r').readlines(): + match = re.match(r'IF ((?:a[0-9]*[^ ]*(?: AND )*)*) THEN correct=([TF]) *\[ *([0-9]*) *([0-9]*)\] *([0-9.]*)', line.strip()) + if match: + m = tuple(match.groups()) + condition = tuple((attributes[field[:-3]], field.endswith('!=F')) for field in m[0].split(' AND ')) + rules.append(Rule(m[-4], condition, (int(m[-3]), int(m[-2])), float(m[-1]))) + #print(rules[-1]) + else: + print('Did not understand rule:', line.strip()) + +def color_print(text, ranges): + i = 0 + for start, length, color in sorted(ranges): + # ignore overlapping ranges + if start < i: + continue + print(text[i:start], end='') + print(colored(text[start:start+length], color), end='') + i = start + length + print(text[i:]) + +# check if given patterns match the rule +def check_rule(rule, patterns): + ret_patterns = [] + for rule_pattern, yes in rule.condition: + if yes: + # this pattern must be present + for pattern, nodes in patterns: + if pattern == rule_pattern: + ret_patterns.append((rule_pattern, nodes)) + else: + # this pattern must not be present + if rule_pattern in [p[0] for p in patterns]: + return [] + return ret_patterns + +# keep track of when each suggestion was applied +all_suggestions = [] +# programs with no matching rule +unsuggestable = collections.Counter() + +for user, trace in traces.items(): + # get submissions from trace + programs = [] + code = '' + for action in parse_trace(trace): + code = action.apply(code) + if action.type == 'test': + if prolog_parse(code) is None: + continue + normalized_code = stringify(rename_vars_list(tokenize(code))) + if programs and normalized_code == programs[-1][0]: + continue + correct = test[normalized_code]['n_tests'] == test[normalized_code]['n_passed'] + programs.append((normalized_code, correct)) + # ignore actions after first correct submission + if correct: + break + + # ignore traces with no / only correct submissions + if not any(p[1] for p in programs) or all(p[1] for p in programs): + continue + + suggested = [] + for i, (program, correct) in enumerate(programs): + program_patterns = list(get_patterns(program)) + #for p in program_patterns: + # print(p[0]) + #print() + + # check if previously suggested rules match + for s in suggested: + s['passed'] += 1 + match = check_rule(s['rule'], program_patterns) + if (s['rule'].klass == 'T' and len(match) == len(s['rule'].condition) or + s['rule'].klass == 'F' and not match): + s['matched'].append(s['passed']) + + # only check programs until first correct submission + if correct: + print(str(i) + ' PASS\t' + program) + print() + break + + # check rules in order, buggy rules first + found = False + for rule in ( + [r for r in rules if r.klass == 'F'] + + [r for r in rules if r.klass == 'T']): + match = check_rule(rule, program_patterns) + if (rule.klass == 'F' and not match or + rule.klass == 'T' and len(match) != len(rule.condition)-1): + continue + found = True + + # store suggestion to see if it was implemented later + if not any(s['program'] == program and s['rule'] == rule for s in suggested): + # passed: how many submission before PASS + # matched: list of submissions where the suggested rule matched + # (the current submission has index 0, the next 1 and so on) + suggested.append({'program': program, 'rule': rule, 'found': i, 'passed': 0, 'matched': []}) + + # get highlights + highlight = set() + for m in match: + for n in m[1]: + highlight.add((n[0].pos, len(n[0].val), ('green' if rule.klass == 'T' else 'red'))) + + # print highighted program + print(str(i) + ' FAIL', end='\t') + color_print(program, list(highlight)) + + # print rule + for rule_pattern, yes in rule.condition: + if rule.klass == 'T': + if rule_pattern in [pattern for pattern, nodes in program_patterns]: + print('good\t' + str(rule_pattern)) + else: + print('missing\t' + str(rule_pattern)) + else: + if rule_pattern in [pattern for pattern, nodes in program_patterns]: + print('buggy\t' + str(rule_pattern)) + print() + break + + if not found: + print(str(i) + ' FAIL\t' + str(program)) + print() + unsuggestable[program] += 1 + + print('Suggestions and versions in which they were implemented:') + for s in suggested: + index = len(programs) - (s['passed'] + 1) + print(index, [index + m for m in s['matched']]) + all_suggestions += suggested + + print('-'*30) + print() + +# report +not_matched = [s for s in all_suggestions if s['passed'] not in s['matched']] +matched = [s for s in all_suggestions if s['passed'] in s['matched']] + +# rules that did / did not match in the solution +good = collections.Counter() +bad = collections.Counter() +for s in all_suggestions: + (good if s in matched else bad)[s['rule']] += 1 + +print('Statistics') +print('----------') +print('# of suggestions that were implemented:', len(matched)) +print('# of suggestions that were not implemented:', len(not_matched)) +print('avg. # of submissions before suggestion was implemented:', + sum(s['matched'][0] for s in matched)/len(matched)) +print('avg. # of submissions until PASS after suggestion was implemented:', + sum(s['passed'] - s['matched'][0] for s in matched)/len(matched)) +print('avg. # of submissions until PASS if suggestion was not implemented:', + sum(s['passed'] for s in not_matched)/len(not_matched)) +#print('avg. % of submissions after suggestion where it was not implemented :', 1-mean(len(s['matched'])/s['passed'] for s in matched)) +print() + +print('Unsuggestable programs') +print('----------------------') +for p, count in unsuggestable.most_common(): + print('{}\t{}'.format(count, p)) +print() + +print('Good rules') +print('----------') +for r, count in good.most_common(): + print('Suggested for ' + str(count) + ' submissions') + print(r) + +print('Bad rules') +print('---------') +for r, count in bad.most_common(): + print('Suggested for ' + str(count) + ' submissions') + print(r) -- cgit v1.2.1