author     Timotej Lazar <timotej.lazar@fri.uni-lj.si>    2017-02-23 16:06:46 +0100
committer  Timotej Lazar <timotej.lazar@fri.uni-lj.si>    2017-02-24 12:02:38 +0100
commit     5ce873e545ecd77710c63dfa3f3bbb3d41fe3aa0 (patch)
tree       7914a886977a6a0254c60bd5e0273c0787baf748
parent     17d335a49f92b8eb45cf366aa235536aeefb5014 (diff)
Modify Prolog parser to produce ASTs instead of parse trees
-rw-r--r--   prolog/lexer.py   |  17
-rw-r--r--   prolog/parser.py  | 256
-rw-r--r--   prolog/util.py    |  31
3 files changed, 194 insertions, 110 deletions
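
Illustrative sketch (not part of this patch): after this change, parser.parse() returns an nltk ParentedTree whose node labels describe syntactic roles (text, clause, directive, head, or, if, and, compound, functor, args, binop, unop, braced, list, variable, literal) instead of mirroring the old parse-tree structure. A minimal walk over the new AST, assuming the package layout in this tree:

    from nltk import ParentedTree
    from prolog.parser import parser

    def show(node, depth=0):
        # Recursively print node labels; leaves are lexer Tokens.
        if isinstance(node, ParentedTree):
            print('  ' * depth + node.label())
            for child in node:
                show(child, depth + 1)
        else:
            print('  ' * depth + repr(node))

    show(parser.parse('a :- b, c.'))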
diff --git a/prolog/lexer.py b/prolog/lexer.py
index f2b19dc..4e6c746 100644
--- a/prolog/lexer.py
+++ b/prolog/lexer.py
@@ -1,7 +1,7 @@
#!/usr/bin/python3
# CodeQ: an online programming tutor.
-# Copyright (C) 2015 UL FRI
+# Copyright (C) 2015-2017 UL FRI
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
@@ -18,19 +18,11 @@
import ply.lex as lex
-# LEXER
-
-#states = (
-# ('comment', 'exclusive'),
-#)
-
-# tokens; treat operators as names if followed by (
operators = {
r':-': 'FROM',
r'-->': 'FROMDCG',
r'->': 'IMPLIES',
r'\+': 'NOT',
- r'not': 'NOT',
r'=': 'EQU',
r'\=': 'NEQU',
r'==': 'EQ',
@@ -89,16 +81,15 @@ t_UREAL = r'[0-9]+\.[0-9]+([eE][-+]?[0-9]+)?|inf|nan'
t_VARIABLE = r'(_|[A-Z])[a-zA-Z0-9_]*'
t_STRING = r'"(""|\\.|[^\"])*"'
-# no support for nested comments yet
+# TODO support nested comments
def t_comment(t):
r'(/\*(.|\n)*?\*/)|(%.*)'
pass
def t_NAME(t):
r"'(''|\\.|[^\\'])*'|[a-z][a-zA-Z0-9_]*|[-+*/\\^<>=~:.?@#$&]+|!|;|,"
- if t.value == ',' or \
- t.lexer.lexpos >= len(t.lexer.lexdata) or t.lexer.lexdata[t.lexer.lexpos] != '(':
- t.type = operators.get(t.value, 'NAME')
+ # return appropriate tokens for names that are operators
+ t.type = operators.get(t.value, 'NAME')
return t
t_ignore = ' \t'
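
With the hunk above, names that double as operators are always tokenized with their operator type, even when immediately followed by “(”; functional notation such as +(1,2) is instead handled by the new p_term_operator_prefix rules in the parser below. A quick check (a sketch, assuming the conventional ply setup where this module defines tokens and an error rule):

    import ply.lex as lex
    import prolog.lexer as prolog_lexer

    lexer = lex.lex(module=prolog_lexer)
    lexer.input('+(1, 2)')
    print([tok.type for tok in lexer])
    # expected roughly: ['PLUS', 'LPAREN', 'UINTEGER', 'COMMA', 'UINTEGER', 'RPAREN']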
diff --git a/prolog/parser.py b/prolog/parser.py
index 1b0459b..599ecd1 100644
--- a/prolog/parser.py
+++ b/prolog/parser.py
@@ -1,5 +1,5 @@
# CodeQ: an online programming tutor.
-# Copyright (C) 2015 UL FRI
+# Copyright (C) 2015-2017 UL FRI
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
@@ -14,7 +14,7 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-from nltk import Tree
+from nltk import ParentedTree
import ply.yacc as yacc
from .lexer import tokens
from .util import Token
@@ -29,88 +29,124 @@ precedence = (
('left', 'PLUS', 'MINUS'),
('left', 'STAR', 'DIV', 'IDIV', 'MOD'),
('nonassoc', 'POW'),
- ('right', 'UMINUS', 'UPLUS'),
('nonassoc', 'UINTEGER', 'UREAL'),
('nonassoc', 'NAME', 'VARIABLE', 'STRING'),
('nonassoc', 'PERIOD'),
('nonassoc', 'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'COMMA', 'SEMI', 'LBRACE', 'RBRACE')
)
+def token_start(p, n):
+ return p.slice[n].lexpos
+
+def token_end(p, n):
+ return p.slice[n].lexpos + len(p.slice[n].value)
+
def make_token(p, n):
lextoken = p.slice[n]
return Token(lextoken.type, lextoken.value, lextoken.lexpos)
+def make_tree(label, children, start=0, end=0):
+ tree = ParentedTree(label, children)
+ tree.start = start
+ tree.end = end
+ return tree
+
+# return a new tree labeled name with a single child p[n]
+def wrap(name, p, n=1):
+ if isinstance(p[n], ParentedTree):
+ return make_tree(name, [p[n]], p[n].start, p[n].end)
+ else:
+ return make_tree(name, [make_token(p, n)], token_start(p, n), token_end(p, n))
+
def p_text_empty(p):
'text : '
- p[0] = Tree('text', [])
+ p[0] = make_tree('text', [])
+
def p_text_clause(p):
'text : text clause'
p[0] = p[1]
p[0].append(p[2])
+ p[0].end = p[2].end
-def p_clause_head(p):
- 'clause : head PERIOD'
- p[0] = Tree('clause', [p[1], make_token(p, 2)])
+def p_clause_fact(p):
+ 'clause : or PERIOD'
+ p[0] = make_tree('clause', [wrap('head', p, 1)], p[1].start, token_end(p, 2))
def p_clause_rule(p):
- '''clause : head FROM or PERIOD
- | head FROMDCG or PERIOD'''
- p[0] = Tree('clause', [p[1], make_token(p, 2), p[3], make_token(p, 4)])
-
-def p_head(p):
- 'head : term'
- p[0] = Tree('head', [p[1]])
+ '''clause : or FROM or PERIOD
+ | or FROMDCG or PERIOD'''
+ p[0] = make_tree('clause', [wrap('head', p, 1), p[3]], p[1].start, token_end(p, 4))
+def p_clause_directive(p):
+ '''clause : FROM or PERIOD'''
+ p[0] = make_tree('directive', [p[2]], token_start(p, 1), token_end(p, 3))
def p_or_single(p):
'or : if'
p[0] = p[1]
def p_or_if(p):
'or : or SEMI if'
- if p[1].label() == 'or':
- p[0] = p[1]
- else:
- p[0] = Tree('or', [p[1]])
- p[0].append(make_token(p, 2))
- p[0].append(p[3])
+ p[0] = make_tree('or', [p[1], p[3]], p[1].start, p[3].end)
def p_if_single(p):
'if : and'
p[0] = p[1]
def p_if_and(p):
'if : and IMPLIES if'
- p[0] = Tree('if', [p[1], make_token(p, 2), p[3]])
+ p[0] = make_tree('if', [p[1], p[3]], p[1].start, p[3].end)
def p_and_single(p):
'and : term'
p[0] = p[1]
def p_and_term(p):
- 'and : and COMMA term'
- if p[1].label() == 'and':
- p[0] = p[1]
- else:
- p[0] = Tree('and', [p[1]])
- p[0].append(make_token(p, 2))
- p[0].append(p[3])
+ 'and : term COMMA and'
+ p[0] = make_tree('and', [p[1], p[3]], p[1].start, p[3].end)
+
+# simple terms
+def p_term_variable(p):
+ 'term : VARIABLE'
+ p[0] = wrap('variable', p)
+def p_term_simple(p):
+ '''term : NAME
+ | STRING
+ | UINTEGER
+ | UREAL'''
+ p[0] = wrap('literal', p)
-# Special case for zero-arity predicates supported by SWI-Prolog.
+# compound terms
def p_term_functor_zero(p):
'term : functor LPAREN RPAREN'
- # No whitespace allowed between functor and LPAREN.
- t2 = make_token(p, 2)
- if p[1][0].pos + len(p[1][0].val) < t2.pos:
- raise SyntaxError('whitespace before ' + str(t2))
- p[0] = Tree('term', [p[1], t2, make_token(p, 3)])
+ # special case for zero-arity predicates supported by SWI-Prolog
+ p[0] = make_tree('compound', [p[1]], p[1].start, token_end(p, 3))
def p_term_functor(p):
'term : functor LPAREN args RPAREN'
- # No whitespace allowed between functor and LPAREN.
- t2 = make_token(p, 2)
- if p[1][0].pos + len(p[1][0].val) < t2.pos:
- raise SyntaxError('whitespace before ' + str(t2))
- p[0] = Tree('term', [p[1], t2, p[3], make_token(p, 4)])
+ p[0] = make_tree('compound', [p[1], p[3]], p[1].start, token_end(p, 4))
+
+# compound term functor
+def p_functor(p):
+ 'functor : NAME'
+ p[0] = wrap('functor', p)
+# compound term arguments
+def p_args_single(p):
+ 'args : term'
+ p[0] = wrap('args', p)
+def p_args_multiple(p):
+ 'args : term COMMA args'
+ p[0] = make_tree('args', [p[1], p[3]], p[1].start, p[3].end)
+
+# covers parenthesized terms, e.g. “(a, b ; c)”
def p_term_or(p):
'term : LPAREN or RPAREN'
- p[0] = Tree('term', [make_token(p, 1), p[2], make_token(p, 3)])
-def p_term_binary(p):
+ p[0] = p[2]
+ p[0].start = token_start(p, 1)
+ p[0].end = token_end(p, 3)
+
+# covers terms in braces used for CLP(R) and DCGs, e.g. “{ X = 1 ; X = 2 }”
+def p_term_brace(p):
+ 'term : LBRACE or RBRACE'
+ p[0] = make_tree('braced', [p[2]], token_start(p, 1), token_end(p, 3))
+
+# binary expressions, e.g. “1 + 2”
+def p_term_operator_infix(p):
'''term : term PLUS term
| term MINUS term
| term STAR term
@@ -148,71 +184,98 @@ def p_term_binary(p):
| term LEFD term
| term GTFD term
| term GEFD term'''
- p[0] = Tree('term', [p[1], make_token(p, 2), p[3]])
-def p_term_unary(p):
- '''term : NOT term
- | MINUS term %prec UMINUS
- | PLUS term %prec UPLUS'''
- p[0] = Tree('term', [make_token(p, 1), p[2]])
-def p_term_list(p):
- 'term : list'
- p[0] = Tree('term', [p[1]])
+ p[0] = make_tree('binop', [p[1], make_token(p, 2), p[3]], p[1].start, p[3].end)
-def p_term_simple(p):
- '''term : STRING
- | NAME
- | UINTEGER
- | UREAL
- | VARIABLE'''
- p[0] = Tree('term', [make_token(p, 1)])
+# binary expressions in functional notation, e.g. “+(1,2)”
+def p_term_operator_prefix(p):
+ '''term : PLUS LPAREN term COMMA term RPAREN
+ | MINUS LPAREN term COMMA term RPAREN
+ | STAR LPAREN term COMMA term RPAREN
+ | POW LPAREN term COMMA term RPAREN
+ | DIV LPAREN term COMMA term RPAREN
+ | IDIV LPAREN term COMMA term RPAREN
+ | MOD LPAREN term COMMA term RPAREN
-def p_term_clpr(p):
- 'term : LBRACE clpr RBRACE'
- p[0] = Tree('term', [make_token(p, 1), p[2], make_token(p, 3)])
+ | EQU LPAREN term COMMA term RPAREN
+ | NEQU LPAREN term COMMA term RPAREN
+ | EQ LPAREN term COMMA term RPAREN
+ | NEQ LPAREN term COMMA term RPAREN
+ | UNIV LPAREN term COMMA term RPAREN
+ | IS LPAREN term COMMA term RPAREN
-def p_args_single(p):
- 'args : term'
- p[0] = Tree('args', [p[1]])
-def p_args_term(p):
- 'args : args COMMA term'
- p[0] = p[1]
- p[0].append(make_token(p, 2))
- p[0].append(p[3])
+ | EQA LPAREN term COMMA term RPAREN
+ | NEQA LPAREN term COMMA term RPAREN
+ | LT LPAREN term COMMA term RPAREN
+ | LE LPAREN term COMMA term RPAREN
+ | GT LPAREN term COMMA term RPAREN
+ | GE LPAREN term COMMA term RPAREN
+
+ | LTL LPAREN term COMMA term RPAREN
+ | LEL LPAREN term COMMA term RPAREN
+ | GTL LPAREN term COMMA term RPAREN
+ | GEL LPAREN term COMMA term RPAREN
+
+ | PIPE LPAREN term COMMA term RPAREN
+ | THROUGH LPAREN term COMMA term RPAREN
+ | IN LPAREN term COMMA term RPAREN
+ | INS LPAREN term COMMA term RPAREN
+ | EQFD LPAREN term COMMA term RPAREN
+ | NEQFD LPAREN term COMMA term RPAREN
+ | LTFD LPAREN term COMMA term RPAREN
+ | LEFD LPAREN term COMMA term RPAREN
+ | GTFD LPAREN term COMMA term RPAREN
+ | GEFD LPAREN term COMMA term RPAREN'''
+ start, end = token_start(p, 1), token_end(p, 6)
+ p[0] = make_tree('binop', [p[3], make_token(p, 1), p[5]], start, end)
+# unary operators
+def p_term_operator_unary(p):
+ '''term : NOT term
+ | MINUS term
+ | PLUS term'''
+ # shift/reduce conflict for MINUS and PLUS with p_term_operator_prefix above:
+ # ply prefers shifting and will resolve +(2,2) to the binary expression “2+2”
+ # instead of the unary “+ (2,2)”, which is the behavior we want
+ p[0] = make_tree('unop', [make_token(p, 1), p[2]], token_start(p, 1), p[2].end)
+
+# lists
+def p_term_list(p):
+ 'term : list'
+ p[0] = p[1]
def p_list_empty(p):
'list : LBRACKET RBRACKET'
- p[0] = Tree('list', [make_token(p, 1), make_token(p, 2)])
+ start, end = token_start(p, 1), token_end(p, 2)
+ p[0] = make_tree('literal', [Token(type='NIL', val='[]', pos=start)], start, end)
def p_list(p):
- 'list : LBRACKET args RBRACKET'
- p[0] = Tree('list', [make_token(p, 1)] + list(p[2]) + [make_token(p, 3)])
-def p_list_tail(p):
- 'list : LBRACKET args PIPE term RBRACKET'
- p[0] = Tree('list', [make_token(p, 1)] + list(p[2]) + [make_token(p, 3), p[4], make_token(p, 5)])
+ 'list : LBRACKET elems RBRACKET'
+ p[0] = p[2]
+ p[0].start = token_start(p, 1)
+ p[0].end = token_end(p, 3)
-def p_functor(p):
- 'functor : NAME'
- p[0] = Tree('functor', [make_token(p, 1)])
-
-# CLP(R) syntax
-def p_clpr_single(p):
- 'clpr : clpr_constr'
- p[0] = Tree('clpr', [p[1]])
-def p_clpr_more(p):
- '''clpr : clpr_constr COMMA clpr
- | clpr_constr SEMI clpr'''
- p[0] = Tree('clpr', [p[1], make_token(p, 2), p[3]])
-# XXX temporary until the new parser is in place, this also covers { } notation for DCGs
-def p_clpr_constr(p):
- 'clpr_constr : term'
- p[0] = p[1]
+# list elements
+def p_elems_single(p):
+ 'elems : term'
+ if p[1].label() == 'binop' and p[1][1].type == 'PIPE':
+ # p[1] has three children: left, "|", right
+ left = p[1].pop(0)
+ right = p[1].pop(1)
+ p[0] = make_tree('list', [left, right], p[1].start, right.end)
+ else:
+ # p[1] has one child: term
+ start, end = p[1].start, p[1].end
+ nil = make_tree('literal', [Token(type='NIL', val='[]', pos=start)], start, end)
+ p[0] = make_tree('list', [p[1], nil], start, end)
+def p_elems_multiple(p):
+ 'elems : term COMMA elems'
+ p[0] = make_tree('list', [p[1], p[3]], p[1].start, p[3].end)
def p_error(t):
- if t is None:
- raise SyntaxError('unexpected end of file')
+ if t is not None:
+ raise SyntaxError('at position {}: unexpected ‘{}’'.format(t.lexpos, t.value))
else:
- raise SyntaxError('{}: unexpected {}'.format(t.lexpos, t.value))
+ raise SyntaxError('unexpected EOF')
-parser = yacc.yacc(debug=False)
+parser = yacc.yacc(debug=True)
if __name__ == '__main__':
from .util import stringify
@@ -224,4 +287,9 @@ if __name__ == '__main__':
if not s:
continue
ast = parser.parse(s)
+ def pp(node):
+ if isinstance(node, ParentedTree):
+ return '(' + node.label() + ' ' + ' '.join([pp(child) for child in node]) + ')'
+ return '"' + str(node) + '"'
+ print(pp(ast))
print(stringify(ast))
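
For reference, feeding a clause such as a :- b, c. to the __main__ loop above should print roughly the following, assuming Token.__str__ returns the token's value (as prolog/util.py relies on):

    (text (clause (head (literal "a")) (and (literal "b") (literal "c"))))
    a :- b, c.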
diff --git a/prolog/util.py b/prolog/util.py
index 402b920..d5c5050 100644
--- a/prolog/util.py
+++ b/prolog/util.py
@@ -1,5 +1,5 @@
# CodeQ: an online programming tutor.
-# Copyright (C) 2015 UL FRI
+# Copyright (C) 2015-2017 UL FRI
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
@@ -75,9 +75,34 @@ def stringify(obj):
if obj.type in operators.values():
return ' ' + str(obj) + ' '
return str(obj)
+
+ if isinstance(obj, Tree):
+ label = obj.label()
+ children = [stringify(child) for child in obj]
+ if label == 'text':
+ return ' '.join(children)
+ elif label == 'clause':
+ if len(children) == 1:
+ return children[0] + '.'
+ elif len(children) == 2:
+ return children[0] + ' :- ' + children[1] + '.'
+ elif label == 'directive':
+ return ':- ' + children[0] + '.'
+ elif label == 'or':
+ return ' ; '.join(children)
+ elif label == 'if':
+ return ' -> '.join(children)
+ elif label == 'and':
+ return ', '.join(children)
+ elif label == 'compound':
+ return children[0] + '(' + (children[1] if len(children) > 1 else '') + ')'
+ elif label == 'args':
+ return ','.join(children)
+ elif label == 'list':
+ # TODO pretty print
+ return '[' + '|'.join(children) + ']'
+
if isinstance(obj, Iterable):
- if isinstance(obj, Tree) and obj.label() == 'clause':
- return ''.join([stringify(child) for child in obj]) + '\n'
return ''.join([stringify(child) for child in obj])
# Return a canonical name for the [n]th variable in scope.
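
Because p_elems_single/p_elems_multiple now build cons-style list trees, stringify prints lists in head|tail form until the “TODO pretty print” above is addressed. A sketch, assuming the parser and stringify from this patch:

    from prolog.parser import parser
    from prolog.util import stringify

    print(stringify(parser.parse('p([a, b]).')))
    # expected roughly: p([a|[b|[]]]).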