From 770f1ed8f7aeeb52db483dd72a5d4712839f9760 Mon Sep 17 00:00:00 2001 From: Timotej Lazar Date: Mon, 17 Nov 2014 12:32:19 +0100 Subject: Keep token positions when lexing This will allow us to match line edits to original source locations. --- prolog/engine.py | 2 +- prolog/util.py | 62 ++++++++++++++++++++++++++++---------------------------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/prolog/engine.py b/prolog/engine.py index 6087bbc..299d2eb 100755 --- a/prolog/engine.py +++ b/prolog/engine.py @@ -139,7 +139,7 @@ class PrologEngine(object): try: start = 0 for idx in range(len(tokens)): - if tokens[idx] != ('PERIOD', '.') or idx - start <= 1: + if tokens[idx].type != 'PERIOD' or idx - start <= 1: continue rule = stringify(tokens[start:idx]) orig_rule = rule diff --git a/prolog/util.py b/prolog/util.py index b7805db..b7e536b 100644 --- a/prolog/util.py +++ b/prolog/util.py @@ -5,13 +5,11 @@ import math import re from .lexer import lexer +from util import Token -# new lexer stuff def tokenize(text): - # feed the troll lexer.input(text) - # we are not interested in line numbers and absolute positions - return [(t.type, t.value) for t in lexer] + return [Token(t.type, t.value, t.lexpos) for t in lexer] operators = set([ 'FROM', 'IMPLIES', 'NOT', @@ -22,11 +20,11 @@ operators = set([ ]) def stringify(tokens): def token_str(t): - if t[0] in ('PERIOD', 'COMMA'): - return t[1] + ' ' - elif t[0] in operators: - return ' ' + t[1] + ' ' - return t[1] + if t.type in ('PERIOD', 'COMMA'): + return str(t) + ' ' + if t.type in operators: + return ' ' + str(t) + ' ' + return str(t) return ''.join(map(token_str, tokens)) # return a list of lines in 'code', and a list of rule indexes @@ -34,35 +32,35 @@ def decompose(code): lines = [] rules = [] tokens = tokenize(code) - tokens.append(('EOF', '')) + tokens.append(Token('EOF')) line = [] parens = [] rule_start = 0 for t in tokens: - if t[0] == 'SEMI': + if t.type == 'SEMI': if line != []: lines.append(tuple(line)) line = [] lines.append((t,)) continue if not parens: - if t[0] in ('PERIOD', 'FROM', 'COMMA', 'EOF'): + if t.type in ('PERIOD', 'FROM', 'COMMA', 'EOF'): if line != []: lines.append(tuple(line)) line = [] - if t[0] in ('PERIOD', 'EOF') and rule_start < len(lines): + if t.type in ('PERIOD', 'EOF') and rule_start < len(lines): rules.append((rule_start, len(lines))) rule_start = len(lines) continue - if t[0] in ('LPAREN', 'LBRACKET', 'LBRACE'): - parens.append(t[0]) + if t.type in ('LPAREN', 'LBRACKET', 'LBRACE'): + parens.append(t.type) elif parens: - if t[0] == 'RPAREN' and parens[-1] == 'LPAREN': + if t.type == 'RPAREN' and parens[-1] == 'LPAREN': parens.pop() - elif t[0] == 'RBRACKET' and parens[-1] == 'LBRACKET': + elif t.type == 'RBRACKET' and parens[-1] == 'LBRACKET': parens.pop() - elif t[0] == 'RBRACE' and parens[-1] == 'LBRACE': + elif t.type == 'RBRACE' and parens[-1] == 'LBRACE': parens.pop() line.append(t) return tuple(lines), tuple(rules) @@ -81,7 +79,7 @@ def compose(lines, rules): elif i == start: code += ' :-\n' else: - if line and line[-1][0] != 'SEMI' and lines[i+1][-1][0] != 'SEMI': + if line and line[-1].type != 'SEMI' and lines[i+1][-1].type != 'SEMI': code += ',' code += '\n' return code.strip() @@ -92,18 +90,18 @@ def rename_vars(tokens, names={}): names = {k: v for k, v in names.items()} next_id = len(names) for i in range(len(tokens)): - if tokens[i][0] == 'PERIOD': + if tokens[i].type == 'PERIOD': names.clear() next_id = 0 - elif tokens[i] == ('VARIABLE', '_'): - tokens[i] = ('VARIABLE', 'A' + str(next_id)) + elif tokens[i] == Token('VARIABLE', '_'): + tokens[i] = Token('VARIABLE', 'A' + str(next_id)) next_id += 1 - elif tokens[i][0] == 'VARIABLE': - cur_name = tokens[i][1] + elif tokens[i].type == 'VARIABLE': + cur_name = tokens[i].val if cur_name not in names: names[cur_name] = next_id next_id += 1 - tokens[i] = ('VARIABLE', 'A' + str(names[cur_name])) + tokens[i] = Token('VARIABLE', 'A' + str(names[cur_name])) return names # transformation = before → after; applied on line which is part of rule @@ -113,17 +111,17 @@ def map_vars(before, after, line, rule): mapping = {} new_index = 0 for i in range(len(before)): - if line[i][0] == 'VARIABLE': - formal_name = before[i][1] - if line[i][1] != '_': - actual_name = line[i][1] + if line[i].type == 'VARIABLE': + formal_name = before[i].val + if line[i].val != '_': + actual_name = line[i].val else: actual_name = 'New'+str(new_index) new_index += 1 mapping[formal_name] = actual_name - remaining_formal = [t[1] for t in after if t[0] == 'VARIABLE' and t[1] not in mapping.keys()] - remaining_actual = [t[1] for t in rule if t[0] == 'VARIABLE' and t[1] != '_' and t[1] not in mapping.values()] + remaining_formal = [t.val for t in after if t.type == 'VARIABLE' and t.val not in mapping.keys()] + remaining_actual = [t.val for t in rule if t.type == 'VARIABLE' and t.val != '_' and t.val not in mapping.values()] while len(remaining_actual) < len(remaining_formal): remaining_actual.append('New'+str(new_index)) @@ -136,6 +134,8 @@ def map_vars(before, after, line, rule): # Basic sanity check. if __name__ == '__main__': + print(compose(*decompose('dup([H|T], [H1|T1]) :- dup(T1, T2). '))) + rule = tokenize('dup([H|T], [H1|T1]) :- dup(T1, T2). ') line = tokenize('dup([H|T], [H1|T1]) :-') before = tokenize("dup([A0|A1], [A2|A3])") -- cgit v1.2.1