author	Timotej Lazar <timotej.lazar@araneo.org>	2014-11-17 12:32:19 +0100
committer	Aleš Smodiš <aless@guru.si>	2015-08-11 14:26:00 +0200
commit	770f1ed8f7aeeb52db483dd72a5d4712839f9760 (patch)
tree	b56f82f43ac9fbad716b4092b96029a406fc5dc6 /prolog/util.py
parent	a4f46cfe3e2c8b1307df396c6c8c37b4f61a59bd (diff)
Keep token positions when lexing
This will allow us to match line edits to original source locations.
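
The patch replaces the plain (type, value) tuples with a Token object imported from the top-level util module, whose definition is outside this diff. A minimal sketch of a compatible Token, assuming a namedtuple whose equality ignores the stored position (so otherwise-identical tokens compare equal across source locations, as rename_vars requires) and whose str() is just the token value:

    from collections import namedtuple

    # Hypothetical reconstruction of util.Token; the real definition is
    # not part of this diff. Defaults allow calls like Token('EOF').
    class Token(namedtuple('Token', ['type', 'val', 'pos'])):
        def __new__(cls, type, val='', pos=0):
            return super(Token, cls).__new__(cls, type, val, pos)

        # stringify() below relies on str(token) returning just the value.
        def __str__(self):
            return self.val

        # Compare by type and value only, ignoring position.
        def __eq__(self, other):
            return self.type == other.type and self.val == other.val

        def __ne__(self, other):
            return not self.__eq__(other)

        # Tokens are stored in tuples and dicts, so keep them hashable.
        def __hash__(self):
            return hash((self.type, self.val))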
Diffstat (limited to 'prolog/util.py')
-rw-r--r--	prolog/util.py	62
1 file changed, 31 insertions(+), 31 deletions(-)
diff --git a/prolog/util.py b/prolog/util.py
index b7805db..b7e536b 100644
--- a/prolog/util.py
+++ b/prolog/util.py
@@ -5,13 +5,11 @@ import math
 import re
 
 from .lexer import lexer
+from util import Token
 
-# new lexer stuff
 def tokenize(text):
-    # feed the troll
     lexer.input(text)
-    # we are not interested in line numbers and absolute positions
-    return [(t.type, t.value) for t in lexer]
+    return [Token(t.type, t.value, t.lexpos) for t in lexer]
 
 operators = set([
     'FROM', 'IMPLIES', 'NOT',
@@ -22,11 +20,11 @@ operators = set([
 ])
 
 def stringify(tokens):
     def token_str(t):
-        if t[0] in ('PERIOD', 'COMMA'):
-            return t[1] + ' '
-        elif t[0] in operators:
-            return ' ' + t[1] + ' '
-        return t[1]
+        if t.type in ('PERIOD', 'COMMA'):
+            return str(t) + ' '
+        if t.type in operators:
+            return ' ' + str(t) + ' '
+        return str(t)
     return ''.join(map(token_str, tokens))
 # return a list of lines in 'code', and a list of rule indexes
@@ -34,35 +32,35 @@ def decompose(code):
     lines = []
     rules = []
     tokens = tokenize(code)
-    tokens.append(('EOF', ''))
+    tokens.append(Token('EOF'))
 
     line = []
     parens = []
     rule_start = 0
     for t in tokens:
-        if t[0] == 'SEMI':
+        if t.type == 'SEMI':
             if line != []:
                 lines.append(tuple(line))
                 line = []
             lines.append((t,))
             continue
         if not parens:
-            if t[0] in ('PERIOD', 'FROM', 'COMMA', 'EOF'):
+            if t.type in ('PERIOD', 'FROM', 'COMMA', 'EOF'):
                 if line != []:
                     lines.append(tuple(line))
                     line = []
-                if t[0] in ('PERIOD', 'EOF') and rule_start < len(lines):
+                if t.type in ('PERIOD', 'EOF') and rule_start < len(lines):
                     rules.append((rule_start, len(lines)))
                     rule_start = len(lines)
                 continue
-        if t[0] in ('LPAREN', 'LBRACKET', 'LBRACE'):
-            parens.append(t[0])
+        if t.type in ('LPAREN', 'LBRACKET', 'LBRACE'):
+            parens.append(t.type)
         elif parens:
-            if t[0] == 'RPAREN' and parens[-1] == 'LPAREN':
+            if t.type == 'RPAREN' and parens[-1] == 'LPAREN':
                 parens.pop()
-            elif t[0] == 'RBRACKET' and parens[-1] == 'LBRACKET':
+            elif t.type == 'RBRACKET' and parens[-1] == 'LBRACKET':
                 parens.pop()
-            elif t[0] == 'RBRACE' and parens[-1] == 'LBRACE':
+            elif t.type == 'RBRACE' and parens[-1] == 'LBRACE':
                 parens.pop()
         line.append(t)
     return tuple(lines), tuple(rules)
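
With Token in place, decompose still returns one tuple of tokens per line together with a (start, end) line-index pair for each rule, and compose remains its inverse. A rough sketch of the round trip (output shapes assumed, not captured from this commit):

    >>> lines, rules = decompose('a :- b, c.')
    >>> rules
    ((0, 3),)
    >>> print(compose(lines, rules))
    a :-
    b,
    c.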
@@ -81,7 +79,7 @@ def compose(lines, rules):
             elif i == start:
                 code += ' :-\n'
             else:
-                if line and line[-1][0] != 'SEMI' and lines[i+1][-1][0] != 'SEMI':
+                if line and line[-1].type != 'SEMI' and lines[i+1][-1].type != 'SEMI':
                     code += ','
                 code += '\n'
     return code.strip()
@@ -92,18 +90,18 @@ def rename_vars(tokens, names={}):
     names = {k: v for k, v in names.items()}
     next_id = len(names)
     for i in range(len(tokens)):
-        if tokens[i][0] == 'PERIOD':
+        if tokens[i].type == 'PERIOD':
             names.clear()
             next_id = 0
-        elif tokens[i] == ('VARIABLE', '_'):
-            tokens[i] = ('VARIABLE', 'A' + str(next_id))
+        elif tokens[i] == Token('VARIABLE', '_'):
+            tokens[i] = Token('VARIABLE', 'A' + str(next_id))
             next_id += 1
-        elif tokens[i][0] == 'VARIABLE':
-            cur_name = tokens[i][1]
+        elif tokens[i].type == 'VARIABLE':
+            cur_name = tokens[i].val
             if cur_name not in names:
                 names[cur_name] = next_id
                 next_id += 1
-            tokens[i] = ('VARIABLE', 'A' + str(names[cur_name]))
+            tokens[i] = Token('VARIABLE', 'A' + str(names[cur_name]))
     return names
 
 # transformation = before → after; applied on line which is part of rule
@@ -113,17 +111,17 @@ def map_vars(before, after, line, rule):
     mapping = {}
     new_index = 0
     for i in range(len(before)):
-        if line[i][0] == 'VARIABLE':
-            formal_name = before[i][1]
-            if line[i][1] != '_':
-                actual_name = line[i][1]
+        if line[i].type == 'VARIABLE':
+            formal_name = before[i].val
+            if line[i].val != '_':
+                actual_name = line[i].val
             else:
                 actual_name = 'New'+str(new_index)
                 new_index += 1
             mapping[formal_name] = actual_name
 
-    remaining_formal = [t[1] for t in after if t[0] == 'VARIABLE' and t[1] not in mapping.keys()]
-    remaining_actual = [t[1] for t in rule if t[0] == 'VARIABLE' and t[1] != '_' and t[1] not in mapping.values()]
+    remaining_formal = [t.val for t in after if t.type == 'VARIABLE' and t.val not in mapping.keys()]
+    remaining_actual = [t.val for t in rule if t.type == 'VARIABLE' and t.val != '_' and t.val not in mapping.values()]
 
     while len(remaining_actual) < len(remaining_formal):
         remaining_actual.append('New'+str(new_index))
@@ -136,6 +134,8 @@ def map_vars(before, after, line, rule):
 
 # Basic sanity check.
 if __name__ == '__main__':
+    print(compose(*decompose('dup([H|T], [H1|T1]) :- dup(T1, T2). ')))
+
     rule = tokenize('dup([H|T], [H1|T1]) :- dup(T1, T2). ')
     line = tokenize('dup([H|T], [H1|T1]) :-')
     before = tokenize("dup([A0|A1], [A2|A3])")
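
For illustration, the new tokenize() should now yield something close to the following for a small clause; the token type names come from prolog/lexer.py and are assumptions here rather than output captured from this commit:

    >>> tokenize('a :- b.')
    [Token(type='NAME', val='a', pos=0),
     Token(type='FROM', val=':-', pos=2),
     Token(type='NAME', val='b', pos=5),
     Token(type='PERIOD', val='.', pos=6)]

The stored lexpos is what later allows edited lines to be mapped back to their original source locations, per the commit message above.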