From 5ce873e545ecd77710c63dfa3f3bbb3d41fe3aa0 Mon Sep 17 00:00:00 2001
From: Timotej Lazar
Date: Thu, 23 Feb 2017 16:06:46 +0100
Subject: Modify Prolog parser to produce ASTs instead of parse trees

---
 prolog/lexer.py  |  17 +---
 prolog/parser.py | 256 +++++++++++++++++++++++++++++++++++--------------------
 prolog/util.py   |  31 ++++++-
 3 files changed, 194 insertions(+), 110 deletions(-)

diff --git a/prolog/lexer.py b/prolog/lexer.py
index f2b19dc..4e6c746 100644
--- a/prolog/lexer.py
+++ b/prolog/lexer.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python3
 # CodeQ: an online programming tutor.
-# Copyright (C) 2015 UL FRI
+# Copyright (C) 2015-2017 UL FRI
 #
 # This program is free software: you can redistribute it and/or modify it under
 # the terms of the GNU Affero General Public License as published by the Free
@@ -18,19 +18,11 @@ import ply.lex as lex

-# LEXER
-
-#states = (
-#    ('comment', 'exclusive'),
-#)
-
-# tokens; treat operators as names if followed by (
 operators = {
     r':-': 'FROM',
     r'-->': 'FROMDCG',
     r'->': 'IMPLIES',
     r'\+': 'NOT',
-    r'not': 'NOT',
     r'=': 'EQU',
     r'\=': 'NEQU',
     r'==': 'EQ',
@@ -89,16 +81,15 @@ t_UREAL = r'[0-9]+\.[0-9]+([eE][-+]?[0-9]+)?|inf|nan'
 t_VARIABLE = r'(_|[A-Z])[a-zA-Z0-9_]*'
 t_STRING = r'"(""|\\.|[^\"])*"'

-# no support for nested comments yet
+# TODO support nested comments
 def t_comment(t):
     r'(/\*(.|\n)*?\*/)|(%.*)'
     pass

 def t_NAME(t):
     r"'(''|\\.|[^\\'])*'|[a-z][a-zA-Z0-9_]*|[-+*/\\^<>=~:.?@#$&]+|!|;|,"
-    if t.value == ',' or \
-       t.lexer.lexpos >= len(t.lexer.lexdata) or t.lexer.lexdata[t.lexer.lexpos] != '(':
-        t.type = operators.get(t.value, 'NAME')
+    # return appropriate tokens for names that are operators
+    t.type = operators.get(t.value, 'NAME')
     return t

 t_ignore = ' \t'
diff --git a/prolog/parser.py b/prolog/parser.py
index 1b0459b..599ecd1 100644
--- a/prolog/parser.py
+++ b/prolog/parser.py
@@ -1,5 +1,5 @@
 # CodeQ: an online programming tutor.
-# Copyright (C) 2015 UL FRI
+# Copyright (C) 2015-2017 UL FRI
 #
 # This program is free software: you can redistribute it and/or modify it under
 # the terms of the GNU Affero General Public License as published by the Free
@@ -14,7 +14,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.

-from nltk import Tree
+from nltk import ParentedTree
 import ply.yacc as yacc
 from .lexer import tokens
 from .util import Token
@@ -29,88 +29,124 @@ precedence = (
     ('left', 'PLUS', 'MINUS'),
     ('left', 'STAR', 'DIV', 'IDIV', 'MOD'),
     ('nonassoc', 'POW'),
-    ('right', 'UMINUS', 'UPLUS'),
     ('nonassoc', 'UINTEGER', 'UREAL'),
     ('nonassoc', 'NAME', 'VARIABLE', 'STRING'),
     ('nonassoc', 'PERIOD'),
     ('nonassoc', 'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'COMMA', 'SEMI', 'LBRACE', 'RBRACE')
 )

+def token_start(p, n):
+    return p.slice[n].lexpos
+
+def token_end(p, n):
+    return p.slice[n].lexpos + len(p.slice[n].value)
+
 def make_token(p, n):
     lextoken = p.slice[n]
     return Token(lextoken.type, lextoken.value, lextoken.lexpos)

+def make_tree(label, children, start=0, end=0):
+    tree = ParentedTree(label, children)
+    tree.start = start
+    tree.end = end
+    return tree
+
+# return a new tree labeled name with a single child p[n]
+def wrap(name, p, n=1):
+    if isinstance(p[n], ParentedTree):
+        return make_tree(name, [p[n]], p[n].start, p[n].end)
+    else:
+        return make_tree(name, [make_token(p, n)], token_start(p, n), token_end(p, n))
+
 def p_text_empty(p):
     'text : '
-    p[0] = Tree('text', [])
+    p[0] = make_tree('text', [])
+
 def p_text_clause(p):
     'text : text clause'
     p[0] = p[1]
     p[0].append(p[2])
+    p[0].end = p[2].end

-def p_clause_head(p):
-    'clause : head PERIOD'
-    p[0] = Tree('clause', [p[1], make_token(p, 2)])
+def p_clause_fact(p):
+    'clause : or PERIOD'
+    p[0] = make_tree('clause', [wrap('head', p, 1)], p[1].start, token_end(p, 2))
 def p_clause_rule(p):
-    '''clause : head FROM or PERIOD
-              | head FROMDCG or PERIOD'''
-    p[0] = Tree('clause', [p[1], make_token(p, 2), p[3], make_token(p, 4)])
-def p_head(p):
-    'head : term'
-    p[0] = Tree('head', [p[1]])
+    '''clause : or FROM or PERIOD
+              | or FROMDCG or PERIOD'''
+    p[0] = make_tree('clause', [wrap('head', p, 1), p[3]], p[1].start, token_end(p, 4))
+def p_clause_directive(p):
+    '''clause : FROM or PERIOD'''
+    p[0] = make_tree('directive', [p[2]], token_start(p, 1), token_end(p, 3))

 def p_or_single(p):
     'or : if'
     p[0] = p[1]
 def p_or_if(p):
     'or : or SEMI if'
-    if p[1].label() == 'or':
-        p[0] = p[1]
-    else:
-        p[0] = Tree('or', [p[1]])
-    p[0].append(make_token(p, 2))
-    p[0].append(p[3])
+    p[0] = make_tree('or', [p[1], p[3]], p[1].start, p[3].end)

 def p_if_single(p):
     'if : and'
     p[0] = p[1]
 def p_if_and(p):
     'if : and IMPLIES if'
-    p[0] = Tree('if', [p[1], make_token(p, 2), p[3]])
+    p[0] = make_tree('if', [p[1], p[3]], p[1].start, p[3].end)

 def p_and_single(p):
     'and : term'
     p[0] = p[1]
 def p_and_term(p):
-    'and : and COMMA term'
-    if p[1].label() == 'and':
-        p[0] = p[1]
-    else:
-        p[0] = Tree('and', [p[1]])
-    p[0].append(make_token(p, 2))
-    p[0].append(p[3])
+    'and : term COMMA and'
+    p[0] = make_tree('and', [p[1], p[3]], p[1].start, p[3].end)
+
+# simple terms
+def p_term_variable(p):
+    'term : VARIABLE'
+    p[0] = wrap('variable', p)
+def p_term_simple(p):
+    '''term : NAME
+            | STRING
+            | UINTEGER
+            | UREAL'''
+    p[0] = wrap('literal', p)

-# Special case for zero-arity predicates supported by SWI-Prolog.
+# compound terms
 def p_term_functor_zero(p):
     'term : functor LPAREN RPAREN'
-    # No whitespace allowed between functor and LPAREN.
-    t2 = make_token(p, 2)
-    if p[1][0].pos + len(p[1][0].val) < t2.pos:
-        raise SyntaxError('whitespace before ' + str(t2))
-    p[0] = Tree('term', [p[1], t2, make_token(p, 3)])
+    # special case for zero-arity predicates supported by SWI-Prolog
+    p[0] = make_tree('compound', [p[1]], p[1].start, token_end(p, 3))
 def p_term_functor(p):
     'term : functor LPAREN args RPAREN'
-    # No whitespace allowed between functor and LPAREN.
-    t2 = make_token(p, 2)
-    if p[1][0].pos + len(p[1][0].val) < t2.pos:
-        raise SyntaxError('whitespace before ' + str(t2))
-    p[0] = Tree('term', [p[1], t2, p[3], make_token(p, 4)])
+    p[0] = make_tree('compound', [p[1], p[3]], p[1].start, token_end(p, 4))
+
+# compound term functor
+def p_functor(p):
+    'functor : NAME'
+    p[0] = wrap('functor', p)
+# compound term arguments
+def p_args_single(p):
+    'args : term'
+    p[0] = wrap('args', p)
+def p_args_multiple(p):
+    'args : term COMMA args'
+    p[0] = make_tree('args', [p[1], p[3]], p[1].start, p[3].end)
+
+# covers parenthesized terms, e.g. “(a, b ; c)”
 def p_term_or(p):
     'term : LPAREN or RPAREN'
-    p[0] = Tree('term', [make_token(p, 1), p[2], make_token(p, 3)])
-def p_term_binary(p):
+    p[0] = p[2]
+    p[0].start = token_start(p, 1)
+    p[0].end = token_end(p, 3)
+
+# covers terms in braces used for CLP(R) and DCGs, e.g. “{ X = 1 ; X = 2}”
+def p_term_brace(p):
+    'term : LBRACE or RBRACE'
+    p[0] = make_tree('braced', [p[2]], token_start(p, 1), token_end(p, 3))
+
+# binary expressions, e.g. “1 + 2”
+def p_term_operator_infix(p):
     '''term : term PLUS term
             | term MINUS term
             | term STAR term
@@ -148,71 +184,98 @@ def p_term_binary(p):
             | term LEFD term
             | term GTFD term
             | term GEFD term'''
-    p[0] = Tree('term', [p[1], make_token(p, 2), p[3]])
-def p_term_unary(p):
-    '''term : NOT term
-            | MINUS term %prec UMINUS
-            | PLUS term %prec UPLUS'''
-    p[0] = Tree('term', [make_token(p, 1), p[2]])
-def p_term_list(p):
-    'term : list'
-    p[0] = Tree('term', [p[1]])
+    p[0] = make_tree('binop', [p[1], make_token(p, 2), p[3]], p[1].start, p[3].end)

-def p_term_simple(p):
-    '''term : STRING
-            | NAME
-            | UINTEGER
-            | UREAL
-            | VARIABLE'''
-    p[0] = Tree('term', [make_token(p, 1)])
+# binary expressions in functional notation, e.g. “+(1,2)”
+def p_term_operator_prefix(p):
+    '''term : PLUS LPAREN term COMMA term RPAREN
+            | MINUS LPAREN term COMMA term RPAREN
+            | STAR LPAREN term COMMA term RPAREN
+            | POW LPAREN term COMMA term RPAREN
+            | DIV LPAREN term COMMA term RPAREN
+            | IDIV LPAREN term COMMA term RPAREN
+            | MOD LPAREN term COMMA term RPAREN

-def p_term_clpr(p):
-    'term : LBRACE clpr RBRACE'
-    p[0] = Tree('term', [make_token(p, 1), p[2], make_token(p, 3)])
+            | EQU LPAREN term COMMA term RPAREN
+            | NEQU LPAREN term COMMA term RPAREN
+            | EQ LPAREN term COMMA term RPAREN
+            | NEQ LPAREN term COMMA term RPAREN
+            | UNIV LPAREN term COMMA term RPAREN
+            | IS LPAREN term COMMA term RPAREN

-def p_args_single(p):
-    'args : term'
-    p[0] = Tree('args', [p[1]])
-def p_args_term(p):
-    'args : args COMMA term'
-    p[0] = p[1]
-    p[0].append(make_token(p, 2))
-    p[0].append(p[3])
+            | EQA LPAREN term COMMA term RPAREN
+            | NEQA LPAREN term COMMA term RPAREN
+            | LT LPAREN term COMMA term RPAREN
+            | LE LPAREN term COMMA term RPAREN
+            | GT LPAREN term COMMA term RPAREN
+            | GE LPAREN term COMMA term RPAREN
+
+            | LTL LPAREN term COMMA term RPAREN
+            | LEL LPAREN term COMMA term RPAREN
+            | GTL LPAREN term COMMA term RPAREN
+            | GEL LPAREN term COMMA term RPAREN
+
+            | PIPE LPAREN term COMMA term RPAREN
+            | THROUGH LPAREN term COMMA term RPAREN
+            | IN LPAREN term COMMA term RPAREN
+            | INS LPAREN term COMMA term RPAREN
+            | EQFD LPAREN term COMMA term RPAREN
+            | NEQFD LPAREN term COMMA term RPAREN
+            | LTFD LPAREN term COMMA term RPAREN
+            | LEFD LPAREN term COMMA term RPAREN
+            | GTFD LPAREN term COMMA term RPAREN
+            | GEFD LPAREN term COMMA term RPAREN'''
+    start, end = token_start(p, 1), token_end(p, 6)
+    p[0] = make_tree('binop', [p[3], make_token(p, 1), p[5]], start, end)
+
+# unary operators
+def p_term_operator_unary(p):
+    '''term : NOT term
+            | MINUS term
+            | PLUS term'''
+    # shift/reduce conflict for MINUS and PLUS with p_term_operator_prefix above:
+    # ply prefers shifting and will resolve +(2,2) to the binary expression “2+2”
+    # instead of the unary “+ (2,2)” (this behavior is what we want)
+    p[0] = make_tree('unop', [make_token(p, 1), p[2]], token_start(p, 1), p[2].end)
+
+# lists
+def p_term_list(p):
+    'term : list'
+    p[0] = p[1]

 def p_list_empty(p):
     'list : LBRACKET RBRACKET'
-    p[0] = Tree('list', [make_token(p, 1), make_token(p, 2)])
+    start, end = token_start(p, 1), token_end(p, 2)
+    p[0] = make_tree('literal', [Token(type='NIL', val='[]', pos=start)], start, end)
 def p_list(p):
-    'list : LBRACKET args RBRACKET'
-    p[0] = Tree('list', [make_token(p, 1)] + list(p[2]) + [make_token(p, 3)])
-def p_list_tail(p):
-    'list : LBRACKET args PIPE term RBRACKET'
-    p[0] = Tree('list', [make_token(p, 1)] + list(p[2]) + [make_token(p, 3), p[4], make_token(p, 5)])
+    'list : LBRACKET elems RBRACKET'
+    p[0] = p[2]
+    p[0].start = token_start(p, 1)
+    p[0].end = token_end(p, 3)

-def p_functor(p):
-    'functor : NAME'
-    p[0] = Tree('functor', [make_token(p, 1)])
-
-# CLP(R) syntax
-def p_clpr_single(p):
-    'clpr : clpr_constr'
-    p[0] = Tree('clpr', [p[1]])
-def p_clpr_more(p):
-    '''clpr : clpr_constr COMMA clpr
-            | clpr_constr SEMI clpr'''
-    p[0] = Tree('clpr', [p[1], make_token(p, 2), p[3]])
-# XXX temporary until the new parser is in place, this also covers { } notation for DCGs
-def p_clpr_constr(p):
-    'clpr_constr : term'
-    p[0] = p[1]
+# list elements
+def p_elems_single(p):
+    'elems : term'
+    if p[1].label() == 'binop' and p[1][1].type == 'PIPE':
+        # p[1] has three children: left, "|", right
+        left = p[1].pop(0)
+        right = p[1].pop(1)
+        p[0] = make_tree('list', [left, right], p[1].start, right.end)
+    else:
+        # p[1] has one child: term
+        start, end = p[1].start, p[1].end
+        nil = make_tree('literal', [Token(type='NIL', val='[]', pos=start)], start, end)
+        p[0] = make_tree('list', [p[1], nil], start, end)
+def p_elems_multiple(p):
+    'elems : term COMMA elems'
+    p[0] = make_tree('list', [p[1], p[3]], p[1].start, p[3].end)

 def p_error(t):
-    if t is None:
-        raise SyntaxError('unexpected end of file')
+    if t is not None:
+        raise SyntaxError('at position {}: unexpected ‘{}’'.format(t.lexpos, t.value))
     else:
-        raise SyntaxError('{}: unexpected {}'.format(t.lexpos, t.value))
+        raise SyntaxError('unexpected EOF')

-parser = yacc.yacc(debug=False)
+parser = yacc.yacc(debug=True)

 if __name__ == '__main__':
     from .util import stringify
@@ -224,4 +287,9 @@ if __name__ == '__main__':
         if not s:
             continue
         ast = parser.parse(s)
+        def pp(node):
+            if isinstance(node, ParentedTree):
+                return '(' + node.label() + ' ' + ' '.join([pp(child) for child in node]) + ')'
+            return '"' + str(node) + '"'
+        print(pp(ast))
         print(stringify(ast))
diff --git a/prolog/util.py b/prolog/util.py
index 402b920..d5c5050 100644
--- a/prolog/util.py
+++ b/prolog/util.py
@@ -1,5 +1,5 @@
 # CodeQ: an online programming tutor.
-# Copyright (C) 2015 UL FRI
+# Copyright (C) 2015-2017 UL FRI
 #
 # This program is free software: you can redistribute it and/or modify it under
 # the terms of the GNU Affero General Public License as published by the Free
@@ -75,9 +75,34 @@ def stringify(obj):
         if obj.type in operators.values():
             return ' ' + str(obj) + ' '
         return str(obj)
+
+    if isinstance(obj, Tree):
+        label = obj.label()
+        children = [stringify(child) for child in obj]
+        if label == 'text':
+            return ' '.join(children)
+        elif label == 'clause':
+            if len(children) == 1:
+                return children[0] + '.'
+            elif len(children) == 2:
+                return children[0] + ' :- ' + children[1] + '.'
+        elif label == 'directive':
+            return ':- ' + children[0] + '.'
+        elif label == 'or':
+            return ' ; '.join(children)
+        elif label == 'if':
+            return ' -> '.join(children)
+        elif label == 'and':
+            return ', '.join(children)
+        elif label == 'compound':
+            return children[0] + '(' + (children[1] if len(children) > 1 else '') + ')'
+        elif label == 'args':
+            return ','.join(children)
+        elif label == 'list':
+            # TODO pretty print
+            return '[' + '|'.join(children) + ']'
+
     if isinstance(obj, Iterable):
-        if isinstance(obj, Tree) and obj.label() == 'clause':
-            return ''.join([stringify(child) for child in obj]) + '\n'
         return ''.join([stringify(child) for child in obj])

 # Return a canonical name for the [n]th variable in scope.
--
cgit v1.2.1
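
A quick usage sketch, not part of the commit above: the clause "a(X) :- b(X)." is a made-up example, and the snippet simply mirrors the __main__ block of prolog/parser.py, so treat the exact output as approximate.

    # Hypothetical session with this patch applied; assumes the package is importable as "prolog".
    from prolog.parser import parser
    from prolog.util import stringify

    ast = parser.parse('a(X) :- b(X).')
    # The pp() helper defined in __main__ above would print something like:
    #   (text (clause (head (compound (functor "a") (args (variable "X"))))
    #                 (compound (functor "b") (args (variable "X")))))
    print(stringify(ast))   # expected to print roughly: a(X) :- b(X).

Note how the AST keeps only the structure: operator and punctuation tokens are no longer stored as children, and source positions live in the start/end attributes of each subtree instead.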