Diffstat (limited to 'prolog')
-rw-r--r--  prolog/__init__.py |   0
-rw-r--r--  prolog/lexer.py    | 130
-rw-r--r--  prolog/parser.py   | 236
-rw-r--r--  prolog/util.py     | 238
4 files changed, 604 insertions, 0 deletions
diff --git a/prolog/__init__.py b/prolog/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/prolog/__init__.py
diff --git a/prolog/lexer.py b/prolog/lexer.py
new file mode 100644
index 0000000..5023e57
--- /dev/null
+++ b/prolog/lexer.py
@@ -0,0 +1,130 @@
+#!/usr/bin/python3
+
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import ply.lex as lex
+
+# LEXER
+
+#states = (
+#    ('comment', 'exclusive'),
+#)
+
+# tokens; treat operators as names if followed by (
+operators = {
+    r':-': 'FROM',
+    r'-->': 'FROMDCG',
+    r'->': 'IMPLIES',
+    r'\+': 'NOT',
+    r'=': 'EQU',
+    r'\=': 'NEQU',
+    r'==': 'EQ',
+    r'\==': 'NEQ',
+    r'=..': 'UNIV',
+    r'is': 'IS',
+    r'=:=': 'EQA',
+    r'=\=': 'NEQA',
+    r'<': 'LT',
+    r'=<': 'LE',
+    r'>': 'GT',
+    r'>=': 'GE',
+    r'@<': 'LTL',
+    r'@=<': 'LEL',
+    r'@>': 'GTL',
+    r'@>=': 'GEL',
+    r'#=': 'EQFD',
+    r'#\=': 'NEQFD',
+    r'#<': 'LTFD',
+    r'#=<': 'LEFD',
+    r'#>': 'GTFD',
+    r'#>=': 'GEFD',
+    r'in': 'IN',
+    r'ins': 'INS',
+    r'..': 'THROUGH',
+    r'+': 'PLUS',
+    r'-': 'MINUS',
+    r'*': 'STAR',
+    r'/': 'DIV',
+    r'//': 'IDIV',
+    r'mod': 'MOD',
+    r'**': 'POW',
+    r'^': 'POW',
+    r'.': 'PERIOD',
+    r',': 'COMMA',
+    r';': 'SEMI'
+}
+tokens = sorted(list(operators.values())) + [
+    'UINTEGER', 'UREAL',
+    'NAME', 'VARIABLE', 'STRING',
+    'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'PIPE', 'LBRACE', 'RBRACE',
+    'INVALID'
+]
+
+# punctuation
+t_LBRACKET = r'\['
+t_RBRACKET = r'\]'
+t_LPAREN = r'\('
+t_RPAREN = r'\)'
+t_PIPE = r'\|'
+t_LBRACE = r'{'
+t_RBRACE = r'}'
+
+t_UINTEGER = r'[0-9]+'
+t_UREAL = r'[0-9]+\.[0-9]+([eE][-+]?[0-9]+)?|inf|nan'
+t_VARIABLE = r'(_|[A-Z])[a-zA-Z0-9_]*'
+t_STRING = r'"(""|\\.|[^\"])*"'
+
+# no support for nested comments yet
+def t_comment(t):
+    r'(/\*(.|\n)*?\*/)|(%.*)'
+    pass
+
+def t_NAME(t):
+    r"'(''|\\.|[^\\'])*'|[a-z][a-zA-Z0-9_]*|[-+*/\\^<>=~:.?@#$&]+|!|;|,"
+    if t.value == ',' or \
+       t.lexer.lexpos >= len(t.lexer.lexdata) or t.lexer.lexdata[t.lexer.lexpos] != '(':
+        t.type = operators.get(t.value, 'NAME')
+    return t
+
+t_ignore = ' \t'
+
+def t_newline(t):
+    r'\n+'
+    t.lexer.lineno += len(t.value)
+
+def t_error(t):
+    # TODO send this to stderr
+    #print("Illegal character '" + t.value[0] + "'")
+    t.type = 'INVALID'
+    t.value = t.value[0]
+    t.lexer.skip(1)
+    return t
+
+lexer = lex.lex(errorlog=lex.NullLogger())
+
+if __name__ == '__main__':
+    while True:
+        try:
+            s = input('> ')
+        except EOFError:
+            break
+        if not s:
+            continue
+
+        lexer.input(s)
+        tokens = list(lexer)
+        print(tokens)
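For reference, a minimal check of the lexer added above (a sketch, assuming ply is installed and the package is importable as prolog) tokenizes one clause like this:

    from prolog.lexer import lexer

    # 'a' stays a NAME because it is immediately followed by '(';
    # ':-' and the final '.' are mapped to the FROM and PERIOD operator tokens.
    lexer.input('a(X) :- b(X).')
    print([(tok.type, tok.value) for tok in lexer])
    # Expected output, roughly:
    # [('NAME', 'a'), ('LPAREN', '('), ('VARIABLE', 'X'), ('RPAREN', ')'),
    #  ('FROM', ':-'), ('NAME', 'b'), ('LPAREN', '('), ('VARIABLE', 'X'),
    #  ('RPAREN', ')'), ('PERIOD', '.')]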
diff --git a/prolog/parser.py b/prolog/parser.py
new file mode 100644
index 0000000..4d4f9e9
--- /dev/null
+++ b/prolog/parser.py
@@ -0,0 +1,236 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from nltk import Tree
+import ply.yacc as yacc
+from .lexer import tokens
+from .util import Token
+
+# PARSER
+precedence = (
+    ('nonassoc', 'FROM', 'FROMDCG'),
+    ('right', 'PIPE'),
+    ('right', 'IMPLIES'),
+    ('right', 'NOT'),
+    ('nonassoc', 'EQU', 'NEQU', 'EQ', 'NEQ', 'UNIV', 'IS', 'EQA', 'NEQA', 'LT', 'LE', 'GT', 'GE', 'LTL', 'LEL', 'GTL', 'GEL', 'IN', 'INS', 'THROUGH', 'EQFD', 'NEQFD', 'LTFD', 'LEFD', 'GTFD', 'GEFD'),
+    ('left', 'PLUS', 'MINUS'),
+    ('left', 'STAR', 'DIV', 'IDIV', 'MOD'),
+    ('nonassoc', 'POW'),
+    ('right', 'UMINUS', 'UPLUS'),
+    ('nonassoc', 'UINTEGER', 'UREAL'),
+    ('nonassoc', 'NAME', 'VARIABLE', 'STRING'),
+    ('nonassoc', 'PERIOD'),
+    ('nonassoc', 'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'COMMA', 'SEMI', 'LBRACE', 'RBRACE')
+)
+
+def make_token(p, n):
+    lextoken = p.slice[n]
+    return Token(lextoken.type, lextoken.value, lextoken.lexpos)
+
+def p_text_empty(p):
+    'text : '
+    p[0] = Tree('text', [])
+def p_text_clause(p):
+    'text : text clause'
+    p[0] = p[1]
+    p[0].append(p[2])
+
+def p_clause_head(p):
+    'clause : head PERIOD'
+    p[0] = Tree('clause', [p[1]])
+def p_clause_rule(p):
+    '''clause : head FROM or PERIOD
+              | head FROMDCG or PERIOD'''
+    p[0] = Tree('clause', [p[1], p[3]])
+
+def p_head(p):
+    'head : term'
+    p[0] = Tree('head', [p[1]])
+
+def p_or_single(p):
+    'or : if'
+    p[0] = p[1]
+def p_or_if(p):
+    'or : or SEMI if'
+    p[0] = Tree('or', [p[1], p[3]])
+
+def p_if_single(p):
+    'if : and'
+    p[0] = p[1]
+def p_if_and(p):
+    'if : and IMPLIES if'
+    p[0] = Tree('if', [p[1], p[3]])
+
+def p_and_single(p):
+    'and : term'
+    p[0] = p[1]
+def p_and_term(p):
+    'and : term COMMA and'
+    p[0] = Tree('and', [p[1], p[3]])
+
+# Special case for zero-arity predicates supported by SWI-Prolog.
+def p_term_functor_zero(p):
+    'term : functor LPAREN RPAREN'
+    # No whitespace allowed between functor and LPAREN.
+    t2 = make_token(p, 2)
+    if p[1][0].pos + len(p[1][0].val) < t2.pos:
+        raise SyntaxError('whitespace before ' + str(t2))
+    p[0] = Tree('compound', [p[1]])
+def p_term_functor(p):
+    'term : functor LPAREN args RPAREN'
+    # No whitespace allowed between functor and LPAREN.
+    t2 = make_token(p, 2)
+    if p[1][0].pos + len(p[1][0].val) < t2.pos:
+        raise SyntaxError('whitespace before ' + str(t2))
+    p[0] = Tree('compound', [p[1], p[3]])
+
+def p_term_or(p):
+    'term : LPAREN or RPAREN'
+    p[0] = p[2]
+def p_term_binary(p):
+    '''term : term PLUS term
+            | term MINUS term
+            | term STAR term
+            | term POW term
+            | term DIV term
+            | term IDIV term
+            | term MOD term
+
+            | term EQU term
+            | term NEQU term
+            | term EQ term
+            | term NEQ term
+            | term UNIV term
+            | term IS term
+
+            | term EQA term
+            | term NEQA term
+            | term LT term
+            | term LE term
+            | term GT term
+            | term GE term
+
+            | term LTL term
+            | term LEL term
+            | term GTL term
+            | term GEL term
+
+            | term PIPE term
+            | term THROUGH term
+            | term IN term
+            | term INS term
+            | term EQFD term
+            | term NEQFD term
+            | term LTFD term
+            | term LEFD term
+            | term GTFD term
+            | term GEFD term'''
+    p[0] = Tree('binop', [p[1], make_token(p, 2), p[3]])
+def p_term_unary(p):
+    '''term : NOT term
+            | MINUS term %prec UMINUS
+            | PLUS term %prec UPLUS'''
+    p[0] = Tree('unop', [make_token(p, 1), p[2]])
+def p_term_list(p):
+    'term : list'
+    p[0] = p[1]
+
+def p_term_variable(p):
+    'term : VARIABLE'
+    p[0] = Tree('variable', [make_token(p, 1)])
+def p_term_simple(p):
+    '''term : STRING
+            | UINTEGER
+            | UREAL'''
+    p[0] = Tree('literal', [make_token(p, 1)])
+def p_term_name(p):
+    'term : NAME'
+    p[0] = make_token(p, 1)
+
+def p_term_clpr(p):
+    'term : LBRACE clpr RBRACE'
+    p[0] = Tree('term', [make_token(p, 1), p[2], make_token(p, 3)])
+
+# compound term arguments
+def p_args_single(p):
+    'args : term'
+    p[0] = Tree('args', [p[1]])
+def p_args_multiple(p):
+    'args : term COMMA args'
+    p[0] = Tree('args', [p[1], p[3]])
+
+# list elements
+def p_elems_single(p):
+    'elems : term'
+    if isinstance(p[1], Tree) and p[1].label() == 'binop' and p[1][1].type == 'PIPE':
+        p[0] = Tree('list', [p[1][0]])
+        if p[1][2] != Tree('term', [Token(type='NIL', val='[]')]):
+            p[0].append(p[1][2])
+    else:
+        p[0] = Tree('list', [p[1], Token(type='NIL', val='[]')])
+def p_elems_multiple(p):
+    'elems : term COMMA elems'
+    p[0] = Tree('list', [p[1], p[3]])
+
+def p_list_empty(p):
+    'list : LBRACKET RBRACKET'
+    p[0] = Tree('literal', [Token(type='NIL', val='[]')])
+def p_list(p):
+    'list : LBRACKET elems RBRACKET'
+    p[0] = p[2]
+
+def p_functor(p):
+    'functor : NAME'
+    p[0] = Tree('functor', [make_token(p, 1)])
+
+# CLP(R) syntax
+def p_clpr_single(p):
+    'clpr : clpr_constr'
+    p[0] = Tree('clpr', [p[1]])
+def p_clpr_more(p):
+    '''clpr : clpr_constr COMMA clpr
+            | clpr_constr SEMI clpr'''
+    p[0] = Tree('clpr', [p[1], make_token(p, 2), p[3]])
+# XXX temporary until the new parser is in place, this also covers { } notation for DCGs
+def p_clpr_constr(p):
+    'clpr_constr : term'
+    p[0] = p[1]
+
+def p_error(t):
+    if t is None:
+        raise SyntaxError('unexpected end of file')
+    else:
+        raise SyntaxError('{}: unexpected {}'.format(t.lexpos, t.value))
+
+parser = yacc.yacc(debug=False)
+
+if __name__ == '__main__':
+    from .util import stringify
+    while True:
+        try:
+            s = input('> ')
+        except EOFError:
+            break
+        if not s:
+            continue
+        ast = parser.parse(s)
+        def pp(node):
+            if isinstance(node, Tree):
+                return '(' + node.label() + ' ' + ' '.join([pp(child) for child in node]) + ')'
+            return '"' + str(node) + '"'
+        print(pp(ast))
+        #ast.pretty_print()
+        #print(stringify(ast))
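A short sketch of how the parser above is meant to be driven (assuming nltk and ply are installed and the package is importable as prolog); the result is an nltk.Tree rooted at 'text':

    from prolog.parser import parser

    # Parse a two-goal rule: the grammar builds one 'clause' subtree containing
    # a 'head' subtree and an 'and' subtree for the body goals.
    ast = parser.parse('a(X) :- b(X), c(X).')
    ast.pretty_print()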
diff --git a/prolog/util.py b/prolog/util.py
new file mode 100644
index 0000000..402b920
--- /dev/null
+++ b/prolog/util.py
@@ -0,0 +1,238 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from collections import namedtuple
+from collections.abc import Iterable
+import string
+
+from nltk import Tree
+
+# Stores a token's type and value, and optionally the position of the first
+# character in the lexed stream.
+class Token(namedtuple('Token', ['type', 'val', 'pos'])):
+    __slots__ = ()
+
+    # Custom constructor to support default parameters.
+    def __new__(cls, type, val='', pos=None):
+        return super(Token, cls).__new__(cls, type, val, pos)
+
+    def __str__(self):
+        return str(self.val)
+
+    # Only consider type and value when comparing tokens. There is probably a
+    # cleaner way of doing this.
+    __eq__ = lambda x, y: x[0] == y[0] and x[1] == y[1]
+    __ne__ = lambda x, y: x[0] != y[0] or x[1] != y[1]
+    __lt__ = lambda x, y: tuple.__lt__(x[0:2], y[0:2])
+    __le__ = lambda x, y: tuple.__le__(x[0:2], y[0:2])
+    __ge__ = lambda x, y: tuple.__ge__(x[0:2], y[0:2])
+    __gt__ = lambda x, y: tuple.__gt__(x[0:2], y[0:2])
+
+    # Only hash token's value (we don't care about position, and types are
+    # determined by values).
+    def __hash__(self):
+        return hash(self[1])
+
+    # Return a copy of this token, possibly modifying some fields.
+    def clone(self, type=None, val=None, pos=None):
+        return Token(self.type if type is None else type,
+                     self.val if val is None else val,
+                     self.pos if pos is None else pos)
+
+from .lexer import lexer, operators
+from .parser import parser
+
+def parse(code):
+    try:
+        return parser.parse(code)
+    except SyntaxError:
+        return None
+
+# Return a list of tokens in [text].
+def tokenize(text):
+    lexer.input(text)
+    return [Token(t.type, t.value, t.lexpos) for t in lexer]
+
+# Return a one-line string representation of [obj] which may be a Tree or a
+# list of tokens.
+def stringify(obj):
+    if isinstance(obj, Token):
+        if obj.type in ('PERIOD', 'COMMA'):
+            return str(obj) + ' '
+        if obj.type in operators.values():
+            return ' ' + str(obj) + ' '
+        return str(obj)
+    if isinstance(obj, Iterable):
+        if isinstance(obj, Tree) and obj.label() == 'clause':
+            return ''.join([stringify(child) for child in obj]) + '\n'
+        return ''.join([stringify(child) for child in obj])
+
+# Return a canonical name for the [n]th variable in scope.
+def canonical_varname(n):
+    names = string.ascii_uppercase
+    if n < len(names):
+        return names[n]
+    return 'X{}'.format(n)
+
+# Rename variables in [tokens] to A0, A1, A2,… in order of appearance.
+def rename_vars_list(tokens, names=None):
+    if names is None:
+        names = {}
+    next_id = len(names)
+
+    # Return a new list.
+    tokens = list(tokens)
+    for i, t in enumerate(tokens):
+        if t.type == 'VARIABLE' and t.val != '_':
+            cur_name = t.val
+            if cur_name not in names:
+                names[cur_name] = canonical_varname(next_id)
+                next_id += 1
+            tokens[i] = t.clone(val=names[cur_name])
+    return tokens
+
+# Rename variables in AST rooted at [root] to A0, A1, A2,… in order of
+# appearance.
+def rename_vars_ast(root, fixed_names=None):
+    if fixed_names is None:
+        fixed_names = {}
+    names = {}
+    next_id = len(fixed_names) + len(names)
+
+    def rename_aux(node):
+        nonlocal fixed_names, names, next_id
+        if isinstance(node, Tree):
+            if node.label() == 'clause':
+                names = {}
+                next_id = len(fixed_names) + len(names)
+            new_children = [rename_aux(child) for child in node]
+            new_node = Tree(node.label(), new_children)
+        elif isinstance(node, Token):
+            if node.type == 'VARIABLE':
+                token = node
+                if token.val.startswith('_'):
+                    new_node = token.clone(val=canonical_varname(next_id))
+                    next_id += 1
+                else:
+                    cur_name = token.val
+                    if cur_name in fixed_names:
+                        new_name = fixed_names[cur_name]
+                    else:
+                        if cur_name not in names:
+                            names[cur_name] = canonical_varname(next_id)
+                            next_id += 1
+                        new_name = names[cur_name]
+                    new_node = token.clone(val=new_name)
+            else:
+                new_node = node
+        return new_node
+    return rename_aux(root)
+
+# Yield "interesting" parts of a Prolog AST as lists of tokens.
+def interesting_ranges(ast, path=()):
+    if ast.label() in {'clause', 'head', 'or', 'if', 'and'}:
+        if ast.label() == 'clause':
+            # Special case for clause with one goal.
+            if len(ast) == 4 and ast[2].label() == 'term':
+                terminals = ast[2].leaves()
+                yield terminals, path + (ast.label(), 'and')
+
+        if ast.label() == 'and':
+            for i in range(0, len(ast), 2):
+                for j in range(i, len(ast), 2):
+                    subs = ast[i:j+1]
+                    terminals = []
+                    for s in subs:
+                        terminals.extend([s] if isinstance(s, Token) else s.leaves())
+                    # We want at least some context.
+                    if len(terminals) > 1:
+                        yield terminals, path + (ast.label(),)
+        else:
+            terminals = ast.leaves()
+            # We want at least some context.
+            if len(terminals) > 1:
+                yield terminals, path + (ast.label(),)
+
+    for subtree in ast:
+        if isinstance(subtree, Tree):
+            yield from interesting_ranges(subtree, path + (ast.label(),))
+
+# Map "formal" variable names in the edit a→b to actual names in code [tokens].
+# The set [variables] contains all variable names in the current scope. These
+# are used in cases such as [A]→[A,B], where the edit introduces new variables.
+# Return a new version of b with actual variable names.
+def map_vars(a, b, tokens, variables):
+    mapping = {}
+    new_index = 0
+    for i in range(len(a)):
+        if tokens[i].type == 'VARIABLE':
+            formal_name = a[i].val
+            if tokens[i].val != '_':
+                actual_name = tokens[i].val
+            else:
+                actual_name = 'New'+str(new_index)
+                new_index += 1
+            mapping[formal_name] = actual_name
+
+    remaining_formal = [t.val for t in b if t.type == 'VARIABLE' and t.val not in mapping.keys()]
+    remaining_actual = [var for var in variables if var not in mapping.values()]
+
+    while len(remaining_actual) < len(remaining_formal):
+        remaining_actual.append('New'+str(new_index))
+        new_index += 1
+
+    for i, formal_name in enumerate(remaining_formal):
+        mapping[formal_name] = remaining_actual[i]
+
+    return [t if t.type != 'VARIABLE' else t.clone(val=mapping[t.val]) for t in b]
+
+# Return a set of predicate names (e.g. conc/3) used in [program].
+def used_predicates(program):
+    predicates = set()
+    def walk(tree, dcg=False):
+        if isinstance(tree, Tree):
+            # DCG predicates can be called without parameters
+            if tree.label() == 'clause' and len(tree) == 4 and \
+               tree[1].type == 'FROMDCG':
+                dcg = True
+            if tree.label() == 'term' and len(tree) >= 3 and \
+               isinstance(tree[0], Tree) and tree[0].label() == 'functor':
+                if len(tree) == 3:
+                    predicates.add('{}/0'.format(tree[0][0]))
+                else:
+                    predicates.add('{}/{}'.format(tree[0][0], (len(tree[2])+1)//2))
+            for subtree in tree:
+                walk(subtree, dcg)
+        elif isinstance(tree, Token):
+            if dcg and tree.type == 'NAME':
+                predicates.add('{}/{}'.format(tree.val, 2))
+                predicates.add('{}/{}'.format(tree.val, 3))
+                predicates.add('{}/{}'.format(tree.val, 4))
+    tree = parse(program)
+    if tree is not None:
+        walk(tree)
+    return predicates
+
+# Basic sanity check.
+if __name__ == '__main__':
+    var_names = {}
+    before = rename_vars_list(tokenize("dup([A0|A1], [A2|A3])"), var_names)
+    after = rename_vars_list(tokenize("dup([A0|A1], [A5, A4|A3])"), var_names)
+
+    line = tokenize('dup([X|Xs], [Y|Ys])')  # a sample program line to map the edit onto
+    variables = [t.val for t in line if t.type == 'VARIABLE']
+    mapped = map_vars(before, after, line, variables)
+    print(mapped)
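As a rough usage sketch for the helpers in util.py (assuming the prolog package is on the import path), tokenizing a clause, renaming its variables and printing it back yields a canonical one-line form:

    from prolog.util import tokenize, rename_vars_list, stringify

    # Variables are renamed in order of appearance (First->A, Rest->B, L2->C, L3->D).
    toks = rename_vars_list(tokenize('conc([First|Rest], L2, [First|L3]) :- conc(Rest, L2, L3).'))
    print(stringify(toks))
    # should print: conc([A|B], C, [A|D]) :- conc(B, C, D).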