Diffstat (limited to 'prolog')
-rw-r--r--  prolog/__init__.py |   0
-rw-r--r--  prolog/lexer.py    | 130
-rw-r--r--  prolog/parser.py   | 236
-rw-r--r--  prolog/util.py     | 238
4 files changed, 604 insertions, 0 deletions
diff --git a/prolog/__init__.py b/prolog/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/prolog/__init__.py
diff --git a/prolog/lexer.py b/prolog/lexer.py
new file mode 100644
index 0000000..5023e57
--- /dev/null
+++ b/prolog/lexer.py
@@ -0,0 +1,130 @@
+#!/usr/bin/python3
+
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import ply.lex as lex
+
+# LEXER
+
+#states = (
+#    ('comment', 'exclusive'),
+#)
+
+# tokens; treat operators as names if followed by (
+operators = {
+    r':-': 'FROM',
+    r'-->': 'FROMDCG',
+    r'->': 'IMPLIES',
+    r'\+': 'NOT',
+    r'=': 'EQU',
+    r'\=': 'NEQU',
+    r'==': 'EQ',
+    r'\==': 'NEQ',
+    r'=..': 'UNIV',
+    r'is': 'IS',
+    r'=:=': 'EQA',
+    r'=\=': 'NEQA',
+    r'<': 'LT',
+    r'=<': 'LE',
+    r'>': 'GT',
+    r'>=': 'GE',
+    r'@<': 'LTL',
+    r'@=<': 'LEL',
+    r'@>': 'GTL',
+    r'@>=': 'GEL',
+    r'#=': 'EQFD',
+    r'#\=': 'NEQFD',
+    r'#<': 'LTFD',
+    r'#=<': 'LEFD',
+    r'#>': 'GTFD',
+    r'#>=': 'GEFD',
+    r'in': 'IN',
+    r'ins': 'INS',
+    r'..': 'THROUGH',
+    r'+': 'PLUS',
+    r'-': 'MINUS',
+    r'*': 'STAR',
+    r'/': 'DIV',
+    r'//': 'IDIV',
+    r'mod': 'MOD',
+    r'**': 'POW',
+    r'^': 'POW',
+    r'.': 'PERIOD',
+    r',': 'COMMA',
+    r';': 'SEMI'
+}
+tokens = sorted(list(operators.values())) + [
+    'UINTEGER', 'UREAL',
+    'NAME', 'VARIABLE', 'STRING',
+    'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'PIPE', 'LBRACE', 'RBRACE',
+    'INVALID'
+]
+
+# punctuation
+t_LBRACKET = r'\['
+t_RBRACKET = r'\]'
+t_LPAREN = r'\('
+t_RPAREN = r'\)'
+t_PIPE = r'\|'
+t_LBRACE = r'{'
+t_RBRACE = r'}'
+
+t_UINTEGER = r'[0-9]+'
+t_UREAL = r'[0-9]+\.[0-9]+([eE][-+]?[0-9]+)?|inf|nan'
+t_VARIABLE = r'(_|[A-Z])[a-zA-Z0-9_]*'
+t_STRING = r'"(""|\\.|[^\"])*"'
+
+# no support for nested comments yet
+def t_comment(t):
+    r'(/\*(.|\n)*?\*/)|(%.*)'
+    pass
+
+def t_NAME(t):
+    r"'(''|\\.|[^\\'])*'|[a-z][a-zA-Z0-9_]*|[-+*/\\^<>=~:.?@#$&]+|!|;|,"
+    if t.value == ',' or \
+       t.lexer.lexpos >= len(t.lexer.lexdata) or t.lexer.lexdata[t.lexer.lexpos] != '(':
+        t.type = operators.get(t.value, 'NAME')
+    return t
+
+t_ignore = ' \t'
+
+def t_newline(t):
+    r'\n+'
+    t.lexer.lineno += len(t.value)
+
+def t_error(t):
+    # TODO send this to stderr
+    #print("Illegal character '" + t.value[0] + "'")
+    t.type = 'INVALID'
+    t.value = t.value[0]
+    t.lexer.skip(1)
+    return t
+
+lexer = lex.lex(errorlog=lex.NullLogger())
+
+if __name__ == '__main__':
+    while True:
+        try:
+            s = input('> ')
+        except EOFError:
+            break
+        if not s:
+            continue
+
+        lexer.input(s)
+        tokens = list(lexer)
+        print(tokens)
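For reference, a minimal check of the lexer added above (a sketch, assuming ply is installed and the package is importable as prolog) tokenizes one clause like this:

    from prolog.lexer import lexer

    # 'a' stays a NAME because it is immediately followed by '(';
    # ':-' and the final '.' are mapped to the FROM and PERIOD operator tokens.
    lexer.input('a(X) :- b(X).')
    print([(tok.type, tok.value) for tok in lexer])
    # Expected output, roughly:
    # [('NAME', 'a'), ('LPAREN', '('), ('VARIABLE', 'X'), ('RPAREN', ')'),
    #  ('FROM', ':-'), ('NAME', 'b'), ('LPAREN', '('), ('VARIABLE', 'X'),
    #  ('RPAREN', ')'), ('PERIOD', '.')]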
diff --git a/prolog/parser.py b/prolog/parser.py
new file mode 100644
index 0000000..4d4f9e9
--- /dev/null
+++ b/prolog/parser.py
@@ -0,0 +1,236 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from nltk import Tree
+import ply.yacc as yacc
+from .lexer import tokens
+from .util import Token
+
+# PARSER
+precedence = (
+    ('nonassoc', 'FROM', 'FROMDCG'),
+    ('right', 'PIPE'),
+    ('right', 'IMPLIES'),
+    ('right', 'NOT'),
+    ('nonassoc', 'EQU', 'NEQU', 'EQ', 'NEQ', 'UNIV', 'IS', 'EQA', 'NEQA', 'LT', 'LE', 'GT', 'GE', 'LTL', 'LEL', 'GTL', 'GEL', 'IN', 'INS', 'THROUGH', 'EQFD', 'NEQFD', 'LTFD', 'LEFD', 'GTFD', 'GEFD'),
+    ('left', 'PLUS', 'MINUS'),
+    ('left', 'STAR', 'DIV', 'IDIV', 'MOD'),
+    ('nonassoc', 'POW'),
+    ('right', 'UMINUS', 'UPLUS'),
+    ('nonassoc', 'UINTEGER', 'UREAL'),
+    ('nonassoc', 'NAME', 'VARIABLE', 'STRING'),
+    ('nonassoc', 'PERIOD'),
+    ('nonassoc', 'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'COMMA', 'SEMI', 'LBRACE', 'RBRACE')
+)
+
+def make_token(p, n):
+    lextoken = p.slice[n]
+    return Token(lextoken.type, lextoken.value, lextoken.lexpos)
+
+def p_text_empty(p):
+    'text : '
+    p[0] = Tree('text', [])
+def p_text_clause(p):
+    'text : text clause'
+    p[0] = p[1]
+    p[0].append(p[2])
+
+def p_clause_head(p):
+    'clause : head PERIOD'
+    p[0] = Tree('clause', [p[1]])
+def p_clause_rule(p):
+    '''clause : head FROM or PERIOD
+              | head FROMDCG or PERIOD'''
+    p[0] = Tree('clause', [p[1], p[3]])
+
+def p_head(p):
+    'head : term'
+    p[0] = Tree('head', [p[1]])
+
+def p_or_single(p):
+    'or : if'
+    p[0] = p[1]
+def p_or_if(p):
+    'or : or SEMI if'
+    p[0] = Tree('or', [p[1], p[3]])
+
+def p_if_single(p):
+    'if : and'
+    p[0] = p[1]
+def p_if_and(p):
+    'if : and IMPLIES if'
+    p[0] = Tree('if', [p[1], p[3]])
+
+def p_and_single(p):
+    'and : term'
+    p[0] = p[1]
+def p_and_term(p):
+    'and : term COMMA and'
+    p[0] = Tree('and', [p[1], p[3]])
+
+# Special case for zero-arity predicates supported by SWI-Prolog.
+def p_term_functor_zero(p):
+    'term : functor LPAREN RPAREN'
+    # No whitespace allowed between functor and LPAREN.
+    t2 = make_token(p, 2)
+    if p[1][0].pos + len(p[1][0].val) < t2.pos:
+        raise SyntaxError('whitespace before ' + str(t2))
+    p[0] = Tree('compound', [p[1]])
+def p_term_functor(p):
+    'term : functor LPAREN args RPAREN'
+    # No whitespace allowed between functor and LPAREN.
+    t2 = make_token(p, 2)
+    if p[1][0].pos + len(p[1][0].val) < t2.pos:
+        raise SyntaxError('whitespace before ' + str(t2))
+    p[0] = Tree('compound', [p[1], p[3]])
+
+def p_term_or(p):
+    'term : LPAREN or RPAREN'
+    p[0] = p[2]
+def p_term_binary(p):
+    '''term : term PLUS term
+            | term MINUS term
+            | term STAR term
+            | term POW term
+            | term DIV term
+            | term IDIV term
+            | term MOD term
+
+            | term EQU term
+            | term NEQU term
+            | term EQ term
+            | term NEQ term
+            | term UNIV term
+            | term IS term
+
+            | term EQA term
+            | term NEQA term
+            | term LT term
+            | term LE term
+            | term GT term
+            | term GE term
+
+            | term LTL term
+            | term LEL term
+            | term GTL term
+            | term GEL term
+
+            | term PIPE term
+            | term THROUGH term
+            | term IN term
+            | term INS term
+            | term EQFD term
+            | term NEQFD term
+            | term LTFD term
+            | term LEFD term
+            | term GTFD term
+            | term GEFD term'''
+    p[0] = Tree('binop', [p[1], make_token(p, 2), p[3]])
+def p_term_unary(p):
+    '''term : NOT term
+            | MINUS term %prec UMINUS
+            | PLUS term %prec UPLUS'''
+    p[0] = Tree('unop', [make_token(p, 1), p[2]])
+def p_term_list(p):
+    'term : list'
+    p[0] = p[1]
+
+def p_term_variable(p):
+    'term : VARIABLE'
+    p[0] = Tree('variable', [make_token(p, 1)])
+def p_term_simple(p):
+    '''term : STRING
+            | UINTEGER
+            | UREAL'''
+    p[0] = Tree('literal', [make_token(p, 1)])
+def p_term_name(p):
+    'term : NAME'
+    p[0] = make_token(p, 1)
+
+def p_term_clpr(p):
+    'term : LBRACE clpr RBRACE'
+    p[0] = Tree('term', [make_token(p, 1), p[2], make_token(p, 3)])
+
+# compound term arguments
+def p_args_single(p):
+    'args : term'
+    p[0] = Tree('args', [p[1]])
+def p_args_multiple(p):
+    'args : term COMMA args'
+    p[0] = Tree('args', [p[1], p[3]])
+
+# list elements
+def p_elems_single(p):
+    'elems : term'
+    if isinstance(p[1], Tree) and p[1].label() == 'binop' and p[1][1].type == 'PIPE':
+        p[0] = Tree('list', [p[1][0]])
+        if p[1][2] != Tree('term', [Token(type='NIL', val='[]')]):
+            p[0].append(p[1][2])
+    else:
+        p[0] = Tree('list', [p[1], Token(type='NIL', val='[]')])
+def p_elems_multiple(p):
+    'elems : term COMMA elems'
+    p[0] = Tree('list', [p[1], p[3]])
+
+def p_list_empty(p):
+    'list : LBRACKET RBRACKET'
+    p[0] = Tree('literal', [Token(type='NIL', val='[]')])
+def p_list(p):
+    'list : LBRACKET elems RBRACKET'
+    p[0] = p[2]
+
+def p_functor(p):
+    'functor : NAME'
+    p[0] = Tree('functor', [make_token(p, 1)])
+
+# CLP(R) syntax
+def p_clpr_single(p):
+    'clpr : clpr_constr'
+    p[0] = Tree('clpr', [p[1]])
+def p_clpr_more(p):
+    '''clpr : clpr_constr COMMA clpr
+            | clpr_constr SEMI clpr'''
+    p[0] = Tree('clpr', [p[1], make_token(p, 2), p[3]])
+# XXX temporary until the new parser is in place, this also covers { } notation for DCGs
+def p_clpr_constr(p):
+    'clpr_constr : term'
+    p[0] = p[1]
+
+def p_error(t):
+    if t is None:
+        raise SyntaxError('unexpected end of file')
+    else:
+        raise SyntaxError('{}: unexpected {}'.format(t.lexpos, t.value))
+
+parser = yacc.yacc(debug=False)
+
+if __name__ == '__main__':
+    from .util import stringify
+    while True:
+        try:
+            s = input('> ')
+        except EOFError:
+            break
+        if not s:
+            continue
+        ast = parser.parse(s)
+        def pp(node):
+            if isinstance(node, Tree):
+                return '(' + node.label() + ' ' + ' '.join([pp(child) for child in node]) + ')'
+            return '"' + str(node) + '"'
+        print(pp(ast))
+        #ast.pretty_print()
+        #print(stringify(ast))
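A short sketch of how the parser above is meant to be driven (assuming nltk and ply are installed and the package is importable as prolog); the result is an nltk.Tree rooted at 'text':

    from prolog.parser import parser

    # Parse a two-goal rule: the grammar builds one 'clause' subtree containing
    # a 'head' subtree and an 'and' subtree for the body goals.
    ast = parser.parse('a(X) :- b(X), c(X).')
    ast.pretty_print()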
diff --git a/prolog/util.py b/prolog/util.py
new file mode 100644
index 0000000..402b920
--- /dev/null
+++ b/prolog/util.py
@@ -0,0 +1,238 @@
+# CodeQ: an online programming tutor.
+# Copyright (C) 2015 UL FRI
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from collections import namedtuple
+from collections.abc import Iterable
+import string
+
+from nltk import Tree
+
+# Stores a token's type and value, and optionally the position of the first
+# character in the lexed stream.
+class Token(namedtuple('Token', ['type', 'val', 'pos'])):
+    __slots__ = ()
+
+    # Custom constructor to support default parameters.
+    def __new__(cls, type, val='', pos=None):
+        return super(Token, cls).__new__(cls, type, val, pos)
+
+    def __str__(self):
+        return str(self.val)
+
+    # Only consider type and value when comparing tokens. There is probably a
+    # cleaner way of doing this.
+    __eq__ = lambda x, y: x[0] == y[0] and x[1] == y[1]
+    __ne__ = lambda x, y: x[0] != y[0] or x[1] != y[1]
+    __lt__ = lambda x, y: tuple.__lt__(x[0:2], y[0:2])
+    __le__ = lambda x, y: tuple.__le__(x[0:2], y[0:2])
+    __ge__ = lambda x, y: tuple.__ge__(x[0:2], y[0:2])
+    __gt__ = lambda x, y: tuple.__gt__(x[0:2], y[0:2])
+
+    # Only hash token's value (we don't care about position, and types are
+    # determined by values).
+    def __hash__(self):
+        return hash(self[1])
+
+    # Return a copy of this token, possibly modifying some fields.
+    def clone(self, type=None, val=None, pos=None):
+        return Token(self.type if type is None else type,
+                     self.val if val is None else val,
+                     self.pos if pos is None else pos)
+
+from .lexer import lexer, operators
+from .parser import parser
+
+def parse(code):
+    try:
+        return parser.parse(code)
+    except SyntaxError:
+        return None
+
+# Return a list of tokens in [text].
+def tokenize(text):
+    lexer.input(text)
+    return [Token(t.type, t.value, t.lexpos) for t in lexer]
+
+# Return a one-line string representation of [obj] which may be a Tree or a
+# list of tokens.
+def stringify(obj):
+    if isinstance(obj, Token):
+        if obj.type in ('PERIOD', 'COMMA'):
+            return str(obj) + ' '
+        if obj.type in operators.values():
+            return ' ' + str(obj) + ' '
+        return str(obj)
+    if isinstance(obj, Iterable):
+        if isinstance(obj, Tree) and obj.label() == 'clause':
+            return ''.join([stringify(child) for child in obj]) + '\n'
+        return ''.join([stringify(child) for child in obj])
+
+# Return a canonical name for the [n]th variable in scope.
+def canonical_varname(n):
+    names = string.ascii_uppercase
+    if n < len(names):
+        return names[n]
+    return 'X{}'.format(n)
+
+# Rename variables in [tokens] to A0, A1, A2,… in order of appearance.
+def rename_vars_list(tokens, names=None):
+    if names is None:
+        names = {}
+    next_id = len(names)
+
+    # Return a new list.
+    tokens = list(tokens)
+    for i, t in enumerate(tokens):
+        if t.type == 'VARIABLE' and t.val != '_':
+            cur_name = t.val
+            if cur_name not in names:
+                names[cur_name] = canonical_varname(next_id)
+                next_id += 1
+            tokens[i] = t.clone(val=names[cur_name])
+    return tokens
+
+# Rename variables in AST rooted at [root] to A0, A1, A2,… in order of
+# appearance.
+def rename_vars_ast(root, fixed_names=None):
+    if fixed_names is None:
+        fixed_names = {}
+    names = {}
+    next_id = len(fixed_names) + len(names)
+
+    def rename_aux(node):
+        nonlocal fixed_names, names, next_id
+        if isinstance(node, Tree):
+            if node.label() == 'clause':
+                names = {}
+                next_id = len(fixed_names) + len(names)
+            new_children = [rename_aux(child) for child in node]
+            new_node = Tree(node.label(), new_children)
+        elif isinstance(node, Token):
+            if node.type == 'VARIABLE':
+                token = node
+                if token.val.startswith('_'):
+                    new_node = token.clone(val=canonical_varname(next_id))
+                    next_id += 1
+                else:
+                    cur_name = token.val
+                    if cur_name in fixed_names:
+                        new_name = fixed_names[cur_name]
+                    else:
+                        if cur_name not in names:
+                            names[cur_name] = canonical_varname(next_id)
+                            next_id += 1
+                        new_name = names[cur_name]
+                    new_node = token.clone(val=new_name)
+            else:
+                new_node = node
+        return new_node
+    return rename_aux(root)
+
+# Yield "interesting" parts of a Prolog AST as lists of tokens.
+def interesting_ranges(ast, path=()):
+    if ast.label() in {'clause', 'head', 'or', 'if', 'and'}:
+        if ast.label() == 'clause':
+            # Special case for clause with one goal.
+            if len(ast) == 4 and ast[2].label() == 'term':
+                terminals = ast[2].leaves()
+                yield terminals, path + (ast.label(), 'and')
+
+        if ast.label() == 'and':
+            for i in range(0, len(ast), 2):
+                for j in range(i, len(ast), 2):
+                    subs = ast[i:j+1]
+                    terminals = []
+                    for s in subs:
+                        terminals.extend([s] if isinstance(s, Token) else s.leaves())
+                    # We want at least some context.
+                    if len(terminals) > 1:
+                        yield terminals, path + (ast.label(),)
+        else:
+            terminals = ast.leaves()
+            # We want at least some context.
+            if len(terminals) > 1:
+                yield terminals, path + (ast.label(),)
+
+    for subtree in ast:
+        if isinstance(subtree, Tree):
+            yield from interesting_ranges(subtree, path + (ast.label(),))
+
+# Map "formal" variable names in the edit a→b to actual names in code [tokens].
+# The set [variables] contains all variable names in the current scope. These
+# are used in cases such as [A]→[A,B], where the edit introduces new variables.
+# Return a new version of b with actual variable names.
+def map_vars(a, b, tokens, variables):
+    mapping = {}
+    new_index = 0
+    for i in range(len(a)):
+        if tokens[i].type == 'VARIABLE':
+            formal_name = a[i].val
+            if tokens[i].val != '_':
+                actual_name = tokens[i].val
+            else:
+                actual_name = 'New'+str(new_index)
+                new_index += 1
+            mapping[formal_name] = actual_name
+
+    remaining_formal = [t.val for t in b if t.type == 'VARIABLE' and t.val not in mapping.keys()]
+    remaining_actual = [var for var in variables if var not in mapping.values()]
+
+    while len(remaining_actual) < len(remaining_formal):
+        remaining_actual.append('New'+str(new_index))
+        new_index += 1
+
+    for i, formal_name in enumerate(remaining_formal):
+        mapping[formal_name] = remaining_actual[i]
+
+    return [t if t.type != 'VARIABLE' else t.clone(val=mapping[t.val]) for t in b]
+
+# Return a set of predicate names (e.g. conc/3) used in [program].
+def used_predicates(program):
+    predicates = set()
+    def walk(tree, dcg=False):
+        if isinstance(tree, Tree):
+            # DCG predicates can be called without parameters
+            if tree.label() == 'clause' and len(tree) == 4 and \
+               tree[1].type == 'FROMDCG':
+                dcg = True
+            if tree.label() == 'term' and len(tree) >= 3 and \
+               isinstance(tree[0], Tree) and tree[0].label() == 'functor':
+                if len(tree) == 3:
+                    predicates.add('{}/0'.format(tree[0][0]))
+                else:
+                    predicates.add('{}/{}'.format(tree[0][0], (len(tree[2])+1)//2))
+            for subtree in tree:
+                walk(subtree, dcg)
+        elif isinstance(tree, Token):
+            if dcg and tree.type == 'NAME':
+                predicates.add('{}/{}'.format(tree.val, 2))
+                predicates.add('{}/{}'.format(tree.val, 3))
+                predicates.add('{}/{}'.format(tree.val, 4))
+    tree = parse(program)
+    if tree is not None:
+        walk(tree)
+    return predicates
+
+# Basic sanity check.
+if __name__ == '__main__':
+    var_names = {}
+    before = rename_vars_list(tokenize("dup([A0|A1], [A2|A3])"), var_names)
+    after = rename_vars_list(tokenize("dup([A0|A1], [A5, A4|A3])"), var_names)
+
+    line = tokenize('dup([X|Xs], [Y|Ys])')  # a sample program line to map the edit onto
+    variables = [t.val for t in line if t.type == 'VARIABLE']
+    mapped = map_vars(before, after, line, variables)
+    print(mapped)
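As a rough usage sketch for the helpers in util.py (assuming the prolog package is on the import path), tokenizing a clause, renaming its variables and printing it back yields a canonical one-line form:

    from prolog.util import tokenize, rename_vars_list, stringify

    # Variables are renamed in order of appearance (First->A, Rest->B, L2->C, L3->D).
    toks = rename_vars_list(tokenize('conc([First|Rest], L2, [First|L3]) :- conc(Rest, L2, L3).'))
    print(stringify(toks))
    # should print: conc([A|B], C, [A|D]) :- conc(B, C, D).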