prolog/lexer.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104

#!/usr/bin/python3

import ply.lex as lex

# LEXER

#states = (
#    ('comment', 'exclusive'),
#)

# tokens; treat operators as names if followed by (
# Maps the exact source text of each operator to the token type reported for
# it. t_NAME performs the lookup after matching a name or symbol sequence.
operators = {
    r':-':  'FROM',
    r'->':  'IMPLIES',
    r'\+':  'NOT',
    r'not': 'NOT',
    r'=':   'EQU',
    r'\=':  'NEQU',
    r'==':  'EQ',
    r'\==': 'NEQ',
    r'=..': 'UNIV',
    r'is':  'IS',
    r'=:=': 'EQA',
    r'=\=': 'NEQA',
    r'<':   'LT',
    r'=<':  'LE',
    r'>':   'GT',
    r'>=':  'GE',
    r'@<':  'LTL',
    r'@=<': 'LEL',
    r'@>':  'GTL',
    r'@>=': 'GEL',
    r'+':   'PLUS',
    r'-':   'MINUS',
    r'*':   'STAR',
    r'/':   'DIV',
    r'//':  'IDIV',
    r'mod': 'MOD',
    r'**':  'POW',
    r'.':   'PERIOD',
    r',':   'COMMA',
    r';':   'SEMI'
}
# Two spellings share one token type (`\+` and `not` are both NOT), so the
# values are de-duplicated before being declared: listing a token name twice
# makes PLY report it as multiply defined. The order of the list is irrelevant.
tokens = list(dict.fromkeys(operators.values())) + [
    'UINTEGER', 'UREAL',
    'NAME', 'VARIABLE', 'STRING',
    'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'PIPE', 'LBRACE', 'RBRACE',
    'INVALID'
]

# punctuation
# Single-character bracket and separator tokens, each matched literally.
t_LBRACKET = r'\['
t_RBRACKET = r'\]'
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_PIPE = r'\|'
t_LBRACE = r'{'
t_RBRACE = r'}'

# Unsigned integer literal: one or more decimal digits.
t_UINTEGER = r'[0-9]+'
# Unsigned real literal: digits '.' digits with an optional exponent, or the
# words inf / nan.
# NOTE(review): t_NAME's `[a-z][a-zA-Z0-9_]*` alternative also matches `inf`
# and `nan`, and PLY appears to try function rules before string rules, so
# these two alternatives look unreachable -- confirm.
t_UREAL    = r'[0-9]+\.[0-9]+([eE][-+]?[0-9]+)?|inf|nan'
# Variable: starts with an underscore or an uppercase letter.
t_VARIABLE = r'(_|[A-Z])[a-zA-Z0-9_]*'
# Double-quoted string; a doubled quote ("") and backslash escapes are
# accepted inside the quotes.
# NOTE(review): the last character class excludes only the double quote,
# whereas the quoted-atom alternative in t_NAME uses `[^\\']` and also
# excludes the backslash; here a lone backslash is accepted as an ordinary
# character -- confirm whether that difference is intended.
t_STRING   = r'"(""|\\.|[^\"])*"'

# no support for nested comments yet
def t_comment(t):
    r'(/\*(.|\n)*?\*/)|(%.*)'
    # The docstring above is the token regex: a /* ... */ block comment
    # (non-greedy, may span lines) or a % comment running to end of line.
    #
    # Comments are discarded (nothing is returned, so no token is emitted),
    # but a block comment can swallow newlines that t_newline never sees.
    # Count them here so later tokens keep reporting correct line numbers.
    t.lexer.lineno += t.value.count('\n')

def t_NAME(t):
    r"'(''|\\.|[^\\'])*'|[a-z][a-zA-Z0-9_]*|[-+*/\\^<>=~:.?@#$&]+|!|;|,"
    # The docstring above is the token regex: a quoted atom, a lowercase
    # identifier, a run of symbol characters, or one of ! ; ,
    #
    # Peek at the character right after the match. Text immediately followed
    # by '(' keeps its NAME type, so an operator can be used as a functor
    # name; a comma is always looked up regardless of what follows it.
    source = t.lexer.lexdata
    end = t.lexer.lexpos
    followed_by_paren = end < len(source) and source[end] == '('
    if t.value == ',' or not followed_by_paren:
        # Known operators get their dedicated token type; anything else
        # stays a plain NAME.
        t.type = operators.get(t.value, 'NAME')
    return t

# Characters skipped between tokens: space and tab. Newlines are not listed
# here; they are consumed by t_newline so the line counter can be updated.
t_ignore  = ' \t'

def t_newline(t):
    r'\n+'
    # The docstring above is the token regex: a run of one or more newlines.
    # Advance the lexer's line counter by the number of newlines consumed.
    # Nothing is returned, so no token is emitted for them.
    newline_count = t.value.count('\n')
    t.lexer.lineno += newline_count

def t_error(t):
    # Called when no rule matches at the current position. Rather than
    # stopping, report the single offending character as an INVALID token
    # and resume lexing right after it.
    # TODO send a diagnostic for the illegal character to stderr
    t.type = 'INVALID'
    offending_char, lexer = t.value[0], t.lexer
    t.value = offending_char
    lexer.skip(1)
    return t

# Module-level lexer instance built by PLY from the definitions in this
# module. A NullLogger is passed as the error log, so PLY's diagnostics
# about the rule set are discarded instead of being printed.
lexer = lex.lex(errorlog=lex.NullLogger())

if __name__ == '__main__':
    # Minimal interactive loop for trying the lexer by hand: read a line,
    # tokenize it, print the resulting tokens. Ends on EOF (Ctrl-D).
    while True:
        try:
            line = input('> ')
        except EOFError:
            break
        if not line:
            # Nothing typed; prompt again.
            continue

        lexer.input(line)
        # Print the result directly instead of binding it to `tokens`: this
        # code runs at module scope, and that name is the token declaration
        # list defined earlier in the file.
        print(list(lexer))