author     Timotej Lazar <timotej.lazar@fri.uni-lj.si>    2017-02-23 16:06:46 +0100
committer  Timotej Lazar <timotej.lazar@fri.uni-lj.si>    2017-02-24 12:02:38 +0100
commit     5ce873e545ecd77710c63dfa3f3bbb3d41fe3aa0 (patch)
tree       7914a886977a6a0254c60bd5e0273c0787baf748
parent     17d335a49f92b8eb45cf366aa235536aeefb5014 (diff)
Modify Prolog parser to produce ASTs instead of parse trees
-rw-r--r--   prolog/lexer.py   |  17
-rw-r--r--   prolog/parser.py  | 256
-rw-r--r--   prolog/util.py    |  31
3 files changed, 194 insertions, 110 deletions
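
Illustrative sketch (not part of this patch): after this change, parser.parse() returns an nltk ParentedTree whose node labels describe syntactic roles (text, clause, directive, head, or, if, and, compound, functor, args, binop, unop, braced, list, variable, literal) instead of mirroring the old parse-tree structure. A minimal walk over the new AST, assuming the package layout in this tree:

    from nltk import ParentedTree
    from prolog.parser import parser

    def show(node, depth=0):
        # Recursively print node labels; leaves are lexer Tokens.
        if isinstance(node, ParentedTree):
            print('  ' * depth + node.label())
            for child in node:
                show(child, depth + 1)
        else:
            print('  ' * depth + repr(node))

    show(parser.parse('a :- b, c.'))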
diff --git a/prolog/lexer.py b/prolog/lexer.py
index f2b19dc..4e6c746 100644
--- a/prolog/lexer.py
+++ b/prolog/lexer.py
@@ -1,7 +1,7 @@
#!/usr/bin/python3
# CodeQ: an online programming tutor.
-# Copyright (C) 2015 UL FRI
+# Copyright (C) 2015-2017 UL FRI
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
@@ -18,19 +18,11 @@
import ply.lex as lex
-# LEXER
-
-#states = (
-# ('comment', 'exclusive'),
-#)
-
-# tokens; treat operators as names if followed by (
operators = {
r':-': 'FROM',
r'-->': 'FROMDCG',
r'->': 'IMPLIES',
r'\+': 'NOT',
- r'not': 'NOT',
r'=': 'EQU',
r'\=': 'NEQU',
r'==': 'EQ',
@@ -89,16 +81,15 @@ t_UREAL = r'[0-9]+\.[0-9]+([eE][-+]?[0-9]+)?|inf|nan'
t_VARIABLE = r'(_|[A-Z])[a-zA-Z0-9_]*'
t_STRING = r'"(""|\\.|[^\"])*"'
-# no support for nested comments yet
+# TODO support nested comments
def t_comment(t):
r'(/\*(.|\n)*?\*/)|(%.*)'
pass
def t_NAME(t):
r"'(''|\\.|[^\\'])*'|[a-z][a-zA-Z0-9_]*|[-+*/\\^<>=~:.?@#$&]+|!|;|,"
- if t.value == ',' or \
- t.lexer.lexpos >= len(t.lexer.lexdata) or t.lexer.lexdata[t.lexer.lexpos] != '(':
- t.type = operators.get(t.value, 'NAME')
+ # return appropriate tokens for names that are operators
+ t.type = operators.get(t.value, 'NAME')
return t
t_ignore = ' \t'
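
With the hunk above, names that double as operators are always tokenized with their operator type, even when immediately followed by “(”; functional notation such as +(1,2) is instead handled by the new p_term_operator_prefix rules in the parser below. A quick check (a sketch, assuming the conventional ply setup where this module defines tokens and an error rule):

    import ply.lex as lex
    import prolog.lexer as prolog_lexer

    lexer = lex.lex(module=prolog_lexer)
    lexer.input('+(1, 2)')
    print([tok.type for tok in lexer])
    # expected roughly: ['PLUS', 'LPAREN', 'UINTEGER', 'COMMA', 'UINTEGER', 'RPAREN']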
diff --git a/prolog/parser.py b/prolog/parser.py
index 1b0459b..599ecd1 100644
--- a/prolog/parser.py
+++ b/prolog/parser.py
@@ -1,5 +1,5 @@
# CodeQ: an online programming tutor.
-# Copyright (C) 2015 UL FRI
+# Copyright (C) 2015-2017 UL FRI
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
@@ -14,7 +14,7 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-from nltk import Tree
+from nltk import ParentedTree
import ply.yacc as yacc
from .lexer import tokens
from .util import Token
@@ -29,88 +29,124 @@ precedence = (
('left', 'PLUS', 'MINUS'),
('left', 'STAR', 'DIV', 'IDIV', 'MOD'),
('nonassoc', 'POW'),
- ('right', 'UMINUS', 'UPLUS'),
('nonassoc', 'UINTEGER', 'UREAL'),
('nonassoc', 'NAME', 'VARIABLE', 'STRING'),
('nonassoc', 'PERIOD'),
('nonassoc', 'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'COMMA', 'SEMI', 'LBRACE', 'RBRACE')
)
+def token_start(p, n):
+ return p.slice[n].lexpos
+
+def token_end(p, n):
+ return p.slice[n].lexpos + len(p.slice[n].value)
+
def make_token(p, n):
lextoken = p.slice[n]
return Token(lextoken.type, lextoken.value, lextoken.lexpos)
+def make_tree(label, children, start=0, end=0):
+ tree = ParentedTree(label, children)
+ tree.start = start
+ tree.end = end
+ return tree
+
+# return a new tree labeled name with a single child p[n]
+def wrap(name, p, n=1):
+ if isinstance(p[n], ParentedTree):
+ return make_tree(name, [p[n]], p[n].start, p[n].end)
+ else:
+ return make_tree(name, [make_token(p, n)], token_start(p, n), token_end(p, n))
+
def p_text_empty(p):
'text : '
- p[0] = Tree('text', [])
+ p[0] = make_tree('text', [])
+
def p_text_clause(p):
'text : text clause'
p[0] = p[1]
p[0].append(p[2])
+ p[0].end = p[2].end
-def p_clause_head(p):
- 'clause : head PERIOD'
- p[0] = Tree('clause', [p[1], make_token(p, 2)])
+def p_clause_fact(p):
+ 'clause : or PERIOD'
+ p[0] = make_tree('clause', [wrap('head', p, 1)], p[1].start, token_end(p, 2))
def p_clause_rule(p):
- '''clause : head FROM or PERIOD
- | head FROMDCG or PERIOD'''
- p[0] = Tree('clause', [p[1], make_token(p, 2), p[3], make_token(p, 4)])
-
-def p_head(p):
- 'head : term'
- p[0] = Tree('head', [p[1]])
+ '''clause : or FROM or PERIOD
+ | or FROMDCG or PERIOD'''
+ p[0] = make_tree('clause', [wrap('head', p, 1), p[3]], p[1].start, token_end(p, 4))
+def p_clause_directive(p):
+ '''clause : FROM or PERIOD'''
+ p[0] = make_tree('directive', [p[2]], token_start(p, 1), token_end(p, 3))
def p_or_single(p):
'or : if'
p[0] = p[1]
def p_or_if(p):
'or : or SEMI if'
- if p[1].label() == 'or':
- p[0] = p[1]
- else:
- p[0] = Tree('or', [p[1]])
- p[0].append(make_token(p, 2))
- p[0].append(p[3])
+ p[0] = make_tree('or', [p[1], p[3]], p[1].start, p[3].end)
def p_if_single(p):
'if : and'
p[0] = p[1]
def p_if_and(p):
'if : and IMPLIES if'
- p[0] = Tree('if', [p[1], make_token(p, 2), p[3]])
+ p[0] = make_tree('if', [p[1], p[3]], p[1].start, p[3].end)
def p_and_single(p):
'and : term'
p[0] = p[1]
def p_and_term(p):
- 'and : and COMMA term'
- if p[1].label() == 'and':
- p[0] = p[1]
- else:
- p[0] = Tree('and', [p[1]])
- p[0].append(make_token(p, 2))
- p[0].append(p[3])
+ 'and : term COMMA and'
+ p[0] = make_tree('and', [p[1], p[3]], p[1].start, p[3].end)
+
+# simple terms
+def p_term_variable(p):
+ 'term : VARIABLE'
+ p[0] = wrap('variable', p)
+def p_term_simple(p):
+ '''term : NAME
+ | STRING
+ | UINTEGER
+ | UREAL'''
+ p[0] = wrap('literal', p)
-# Special case for zero-arity predicates supported by SWI-Prolog.
+# compound terms
def p_term_functor_zero(p):
'term : functor LPAREN RPAREN'
- # No whitespace allowed between functor and LPAREN.
- t2 = make_token(p, 2)
- if p[1][0].pos + len(p[1][0].val) < t2.pos:
- raise SyntaxError('whitespace before ' + str(t2))
- p[0] = Tree('term', [p[1], t2, make_token(p, 3)])
+ # special case for zero-arity predicates supported by SWI-Prolog
+ p[0] = make_tree('compound', [p[1]], p[1].start, token_end(p, 3))
def p_term_functor(p):
'term : functor LPAREN args RPAREN'
- # No whitespace allowed between functor and LPAREN.
- t2 = make_token(p, 2)
- if p[1][0].pos + len(p[1][0].val) < t2.pos:
- raise SyntaxError('whitespace before ' + str(t2))
- p[0] = Tree('term', [p[1], t2, p[3], make_token(p, 4)])
+ p[0] = make_tree('compound', [p[1], p[3]], p[1].start, token_end(p, 4))
+
+# compound term functor
+def p_functor(p):
+ 'functor : NAME'
+ p[0] = wrap('functor', p)
+# compound term arguments
+def p_args_single(p):
+ 'args : term'
+ p[0] = wrap('args', p)
+def p_args_multiple(p):
+ 'args : term COMMA args'
+ p[0] = make_tree('args', [p[1], p[3]], p[1].start, p[3].end)
+
+# covers parenthesized terms, e.g. “(a, b ; c)”
def p_term_or(p):
'term : LPAREN or RPAREN'
- p[0] = Tree('term', [make_token(p, 1), p[2], make_token(p, 3)])
-def p_term_binary(p):
+ p[0] = p[2]
+ p[0].start = token_start(p, 1)
+ p[0].end = token_end(p, 3)
+
+# covers terms in braces used for CLP(R) and DCGs, e.g. “{ X = 1 ; X = 2 }”
+def p_term_brace(p):
+ 'term : LBRACE or RBRACE'
+ p[0] = make_tree('braced', [p[2]], token_start(p, 1), token_end(p, 3))
+
+# binary expressions, e.g. “1 + 2”
+def p_term_operator_infix(p):
'''term : term PLUS term
| term MINUS term
| term STAR term
@@ -148,71 +184,98 @@ def p_term_binary(p):
| term LEFD term
| term GTFD term
| term GEFD term'''
- p[0] = Tree('term', [p[1], make_token(p, 2), p[3]])
-def p_term_unary(p):
- '''term : NOT term
- | MINUS term %prec UMINUS
- | PLUS term %prec UPLUS'''
- p[0] = Tree('term', [make_token(p, 1), p[2]])
-def p_term_list(p):
- 'term : list'
- p[0] = Tree('term', [p[1]])
+ p[0] = make_tree('binop', [p[1], make_token(p, 2), p[3]], p[1].start, p[3].end)
-def p_term_simple(p):
- '''term : STRING
- | NAME
- | UINTEGER
- | UREAL
- | VARIABLE'''
- p[0] = Tree('term', [make_token(p, 1)])
+# binary expressions in functional notation, e.g. “+(1,2)”
+def p_term_operator_prefix(p):
+ '''term : PLUS LPAREN term COMMA term RPAREN
+ | MINUS LPAREN term COMMA term RPAREN
+ | STAR LPAREN term COMMA term RPAREN
+ | POW LPAREN term COMMA term RPAREN
+ | DIV LPAREN term COMMA term RPAREN
+ | IDIV LPAREN term COMMA term RPAREN
+ | MOD LPAREN term COMMA term RPAREN
-def p_term_clpr(p):
- 'term : LBRACE clpr RBRACE'
- p[0] = Tree('term', [make_token(p, 1), p[2], make_token(p, 3)])
+ | EQU LPAREN term COMMA term RPAREN
+ | NEQU LPAREN term COMMA term RPAREN
+ | EQ LPAREN term COMMA term RPAREN
+ | NEQ LPAREN term COMMA term RPAREN
+ | UNIV LPAREN term COMMA term RPAREN
+ | IS LPAREN term COMMA term RPAREN
-def p_args_single(p):
- 'args : term'
- p[0] = Tree('args', [p[1]])
-def p_args_term(p):
- 'args : args COMMA term'
- p[0] = p[1]
- p[0].append(make_token(p, 2))
- p[0].append(p[3])
+ | EQA LPAREN term COMMA term RPAREN
+ | NEQA LPAREN term COMMA term RPAREN
+ | LT LPAREN term COMMA term RPAREN
+ | LE LPAREN term COMMA term RPAREN
+ | GT LPAREN term COMMA term RPAREN
+ | GE LPAREN term COMMA term RPAREN
+
+ | LTL LPAREN term COMMA term RPAREN
+ | LEL LPAREN term COMMA term RPAREN
+ | GTL LPAREN term COMMA term RPAREN
+ | GEL LPAREN term COMMA term RPAREN
+
+ | PIPE LPAREN term COMMA term RPAREN
+ | THROUGH LPAREN term COMMA term RPAREN
+ | IN LPAREN term COMMA term RPAREN
+ | INS LPAREN term COMMA term RPAREN
+ | EQFD LPAREN term COMMA term RPAREN
+ | NEQFD LPAREN term COMMA term RPAREN
+ | LTFD LPAREN term COMMA term RPAREN
+ | LEFD LPAREN term COMMA term RPAREN
+ | GTFD LPAREN term COMMA term RPAREN
+ | GEFD LPAREN term COMMA term RPAREN'''
+ start, end = token_start(p, 1), token_end(p, 6)
+ p[0] = make_tree('binop', [p[3], make_token(p, 1), p[5]], start, end)
+# unary operators
+def p_term_operator_unary(p):
+ '''term : NOT term
+ | MINUS term
+ | PLUS term'''
+ # shift/reduce conflict for MINUS and PLUS with p_term_operator_prefix above:
+ # ply prefers shifting and will resolve +(2,2) to the binary expression “2+2”
+ # instead of the unary “+ (2,2)”, which is the behavior we want
+ p[0] = make_tree('unop', [make_token(p, 1), p[2]], token_start(p, 1), p[2].end)
+
+# lists
+def p_term_list(p):
+ 'term : list'
+ p[0] = p[1]
def p_list_empty(p):
'list : LBRACKET RBRACKET'
- p[0] = Tree('list', [make_token(p, 1), make_token(p, 2)])
+ start, end = token_start(p, 1), token_end(p, 2)
+ p[0] = make_tree('literal', [Token(type='NIL', val='[]', pos=start)], start, end)
def p_list(p):
- 'list : LBRACKET args RBRACKET'
- p[0] = Tree('list', [make_token(p, 1)] + list(p[2]) + [make_token(p, 3)])
-def p_list_tail(p):
- 'list : LBRACKET args PIPE term RBRACKET'
- p[0] = Tree('list', [make_token(p, 1)] + list(p[2]) + [make_token(p, 3), p[4], make_token(p, 5)])
+ 'list : LBRACKET elems RBRACKET'
+ p[0] = p[2]
+ p[0].start = token_start(p, 1)
+ p[0].end = token_end(p, 3)
-def p_functor(p):
- 'functor : NAME'
- p[0] = Tree('functor', [make_token(p, 1)])
-
-# CLP(R) syntax
-def p_clpr_single(p):
- 'clpr : clpr_constr'
- p[0] = Tree('clpr', [p[1]])
-def p_clpr_more(p):
- '''clpr : clpr_constr COMMA clpr
- | clpr_constr SEMI clpr'''
- p[0] = Tree('clpr', [p[1], make_token(p, 2), p[3]])
-# XXX temporary until the new parser is in place, this also covers { } notation for DCGs
-def p_clpr_constr(p):
- 'clpr_constr : term'
- p[0] = p[1]
+# list elements
+def p_elems_single(p):
+ 'elems : term'
+ if p[1].label() == 'binop' and p[1][1].type == 'PIPE':
+ # p[1] has three children: left, "|", right
+ left = p[1].pop(0)
+ right = p[1].pop(1)
+ p[0] = make_tree('list', [left, right], p[1].start, right.end)
+ else:
+ # p[1] has one child: term
+ start, end = p[1].start, p[1].end
+ nil = make_tree('literal', [Token(type='NIL', val='[]', pos=start)], start, end)
+ p[0] = make_tree('list', [p[1], nil], start, end)
+def p_elems_multiple(p):
+ 'elems : term COMMA elems'
+ p[0] = make_tree('list', [p[1], p[3]], p[1].start, p[3].end)
def p_error(t):
- if t is None:
- raise SyntaxError('unexpected end of file')
+ if t is not None:
+ raise SyntaxError('at position {}: unexpected ‘{}’'.format(t.lexpos, t.value))
else:
- raise SyntaxError('{}: unexpected {}'.format(t.lexpos, t.value))
+ raise SyntaxError('unexpected EOF')
-parser = yacc.yacc(debug=False)
+parser = yacc.yacc(debug=True)
if __name__ == '__main__':
from .util import stringify
@@ -224,4 +287,9 @@ if __name__ == '__main__':
if not s:
continue
ast = parser.parse(s)
+ def pp(node):
+ if isinstance(node, ParentedTree):
+ return '(' + node.label() + ' ' + ' '.join([pp(child) for child in node]) + ')'
+ return '"' + str(node) + '"'
+ print(pp(ast))
print(stringify(ast))
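
For reference, feeding a clause such as a :- b, c. to the __main__ loop above should print roughly the following, assuming Token.__str__ returns the token's value (as prolog/util.py relies on):

    (text (clause (head (literal "a")) (and (literal "b") (literal "c"))))
    a :- b, c.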
diff --git a/prolog/util.py b/prolog/util.py
index 402b920..d5c5050 100644
--- a/prolog/util.py
+++ b/prolog/util.py
@@ -1,5 +1,5 @@
# CodeQ: an online programming tutor.
-# Copyright (C) 2015 UL FRI
+# Copyright (C) 2015-2017 UL FRI
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
@@ -75,9 +75,34 @@ def stringify(obj):
if obj.type in operators.values():
return ' ' + str(obj) + ' '
return str(obj)
+
+ if isinstance(obj, Tree):
+ label = obj.label()
+ children = [stringify(child) for child in obj]
+ if label == 'text':
+ return ' '.join(children)
+ elif label == 'clause':
+ if len(children) == 1:
+ return children[0] + '.'
+ elif len(children) == 2:
+ return children[0] + ' :- ' + children[1] + '.'
+ elif label == 'directive':
+ return ':- ' + children[0] + '.'
+ elif label == 'or':
+ return ' ; '.join(children)
+ elif label == 'if':
+ return ' -> '.join(children)
+ elif label == 'and':
+ return ', '.join(children)
+ elif label == 'compound':
+ return children[0] + '(' + (children[1] if len(children) > 1 else '') + ')'
+ elif label == 'args':
+ return ','.join(children)
+ elif label == 'list':
+ # TODO pretty print
+ return '[' + '|'.join(children) + ']'
+
if isinstance(obj, Iterable):
- if isinstance(obj, Tree) and obj.label() == 'clause':
- return ''.join([stringify(child) for child in obj]) + '\n'
return ''.join([stringify(child) for child in obj])
# Return a canonical name for the [n]th variable in scope.
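
Because p_elems_single/p_elems_multiple now build cons-style list trees, stringify prints lists in head|tail form until the “TODO pretty print” above is addressed. A sketch, assuming the parser and stringify from this patch:

    from prolog.parser import parser
    from prolog.util import stringify

    print(stringify(parser.parse('p([a, b]).')))
    # expected roughly: p([a|[b|[]]]).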