author	Timotej Lazar <timotej.lazar@fri.uni-lj.si>	2015-11-18 13:20:00 +0100
committer	Timotej Lazar <timotej.lazar@fri.uni-lj.si>	2015-12-11 16:11:15 +0100
commit	fa39fe7bfedd0b2e615d369adb5b510ceb9b857f (patch)
tree	abf381b007d1816247cab534ac8e2152695ad596 /prolog
parent	dd723bd01634fa5ffc85402ea10947e472b257af (diff)
Use a more general method for extracting edits
This is a large overhaul of monkey code.

Before, only edits within individual lines were tracked, which required a Prolog-specific method for splitting a program into a list of lines for every rule. In this version, modifications can be tracked within arbitrary code ranges. Ranges to be tracked are determined by selecting "interesting" subtrees in the AST of the starting code version.

The new method is simpler, less language-dependent and easier to extend. The downside is that a program must be syntactically correct before we can attempt to fix it (the previous approach could handle programs with syntax errors in some cases).

This commit also integrates a call to monkey.fix in prolog_session.hint, by running it if no other hint is found.
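To make the range-based idea concrete, here is a minimal sketch (not part of this commit): it diffs the token values of one tracked range between two program versions. difflib stands in for monkey's own matching, and the hand-written token lists stand in for lexer output.

import difflib

# Extract the non-equal edit operations between two versions of one
# tracked code range, each given as a list of token values.
def range_edits(before, after):
    matcher = difflib.SequenceMatcher(a=before, b=after, autojunk=False)
    for op, i1, i2, j1, j2 in matcher.get_opcodes():
        if op != 'equal':
            yield op, before[i1:i2], after[j1:j2]

# One clause before and after a one-token fix.
before = ['sum', '(', '[', ']', ',', '1', ')']
after  = ['sum', '(', '[', ']', ',', '0', ')']
print(list(range_edits(before, after)))
# -> [('replace', ['1'], ['0'])]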
Diffstat (limited to 'prolog')
-rw-r--r--	prolog/util.py	187
1 file changed, 89 insertions, 98 deletions
diff --git a/prolog/util.py b/prolog/util.py
index ba61da0..4b316d5 100644
--- a/prolog/util.py
+++ b/prolog/util.py
@@ -20,12 +20,12 @@ from nltk import Tree
# Stores a token's type and value, and optionally the position of the first
# character in the lexed stream.
-class Token(namedtuple('Token', ['type', 'val', 'pos', 'rule', 'part', 'stop'])):
+class Token(namedtuple('Token', ['type', 'val', 'pos'])):
__slots__ = ()
# Custom constructor to support default parameters.
- def __new__(cls, type, val='', pos=None, rule=None, part=None, stop=False):
- return super(Token, cls).__new__(cls, type, val, pos, rule, part, stop)
+ def __new__(cls, type, val='', pos=None):
+ return super(Token, cls).__new__(cls, type, val, pos)
def __str__(self):
return self.val
@@ -45,13 +45,10 @@ class Token(namedtuple('Token', ['type', 'val', 'pos', 'rule', 'part', 'stop']))
return hash(self[1])
# Return a copy of this token, possibly modifying some fields.
- def clone(self, type=None, val=None, pos=None, rule=None, part=None, stop=None):
+ def clone(self, type=None, val=None, pos=None):
return Token(self.type if type is None else type,
self.val if val is None else val,
- self.pos if pos is None else pos,
- self.rule if rule is None else rule,
- self.part if part is None else part,
- self.stop if stop is None else stop)
+ self.pos if pos is None else pos)
from .lexer import lexer, operators
from .parser import parser
@@ -81,94 +78,6 @@ def stringify(obj):
return ''.join([stringify(child) for child in obj]) + '\n'
return ''.join([stringify(child) for child in obj])
-# Lex [code] into tokens with rule indexes and stop markers.
-def annotate(code):
- rule = 0
- part = 0
- parens = [] # stack of currently open parens/brackets/braces
- in_parens = 0 # COMMA means a new part if this is 0
-
- token = None
- lexer.input(code)
- for t in lexer:
- tok_rule = rule
- tok_part = part
- tok_stop = True
-
- if t.type == 'PERIOD': # .
- rule += 1
- part = 0
- in_parens = 0
- parens = []
- elif t.type in ('FROM', 'SEMI'): # :- ;
- part += 1
- elif t.type == 'COMMA': # ,
- if not parens or in_parens == 0:
- part += 1
- else:
- tok_stop = False
-
- # Handle left parens.
- elif t.type == 'LPAREN': # (
- if token and token.type == 'NAME': # name(
- tok_stop = False
- parens.append('COMPOUND')
- in_parens += 1
- else:
- parens.append(t.type) # …, (
- elif t.type == 'LBRACKET': # [
- tok_stop = False
- parens.append(t.type)
- in_parens += 1
- elif t.type == 'LBRACE': # {
- parens.append(t.type)
-
- # Handle right parens.
- elif t.type == 'RPAREN': # )
- if parens:
- if parens[-1] == 'COMPOUND': # name(…)
- tok_stop = False
- parens.pop()
- in_parens -= 1
- elif parens[-1] == 'LPAREN': # (…)
- parens.pop()
- elif t.type == 'RBRACKET': # ]
- if parens and parens[-1] == 'LBRACKET': # […]
- tok_stop = False
- parens.pop()
- in_parens -= 1
- elif t.type == 'RBRACE': # }
- if parens and parens[-1] == 'LBRACE': # {…}
- parens.pop()
-
- # Normal tokens.
- else:
- tok_stop = False
-
- token = Token(t.type, t.value, t.lexpos, tok_rule, tok_part, tok_stop)
- yield token
-
-# Format a list of annotated [tokens].
-def compose(tokens):
- code = ''
- prev = None
- for t in tokens:
- if t.type == 'SEMI':
- code += '\n '
- if prev and (prev.part != t.part or prev.rule != t.rule):
- code += '\n'
- if t.part:
- code += ' '
-
- if t.type in ('PERIOD', 'COMMA'):
- code += t.val + ' '
- elif t.type in operators.values():
- code += ' ' + t.val + ' '
- else:
- code += t.val
- prev = t
- return code.strip()
-
# Rename variables in [tokens] to A0, A1, A2,… in order of appearance.
def rename_vars(tokens, names=None):
if names is None:
@@ -179,8 +88,9 @@ def rename_vars(tokens, names=None):
tokens = list(tokens)
for i, t in enumerate(tokens):
if t.type == 'PERIOD':
- names.clear()
- next_id = 0
+ pass
+# names.clear()
+# next_id = 0
elif t.type == 'VARIABLE':
if t.val.startswith('_'):
tokens[i] = t.clone(val='A{}'.format(next_id))
@@ -193,6 +103,87 @@ def rename_vars(tokens, names=None):
tokens[i] = t.clone(val=names[cur_name])
return tokens
+# Rename variables in [tokens] to A0, A1, A2,… in order of appearance.
+def rename_vars_list(tokens, names=None):
+ if names is None:
+ names = {}
+ next_id = len(names)
+
+ # Return a new list.
+ tokens = list(tokens)
+ for i, t in enumerate(tokens):
+ if t.type == 'VARIABLE':
+ if t.val.startswith('_'):
+ tokens[i] = t.clone(val='A{}'.format(next_id))
+ next_id += 1
+ else:
+ cur_name = t.val
+ if cur_name not in names:
+ names[cur_name] = 'A{}'.format(next_id)
+ next_id += 1
+ tokens[i] = t.clone(val=names[cur_name])
+ return tokens
+
+# Rename variables in AST rooted at [root] to A0, A1, A2,… in order of
+# appearance.
+def rename_vars_ast2(root, fixed_names=None):
+ if fixed_names is None:
+ fixed_names = {}
+ names = {}
+ next_id = len(fixed_names) + len(names)
+
+ def rename_aux(node):
+ nonlocal fixed_names, names, next_id
+ if isinstance(node, Tree):
+ if node.label() == 'clause':
+ names = {}
+ next_id = len(fixed_names) + len(names)
+ new_children = [rename_aux(child) for child in node]
+ new_node = Tree(node.label(), new_children)
+ elif isinstance(node, Token):
+ if node.type == 'VARIABLE':
+ token = node
+ if token.val.startswith('_'):
+ new_node = token.clone(val='A{}'.format(next_id))
+ next_id += 1
+ else:
+ cur_name = token.val
+ if cur_name in fixed_names:
+ new_name = fixed_names[cur_name]
+ else:
+ if cur_name not in names:
+ names[cur_name] = 'A{}'.format(next_id)
+ next_id += 1
+ new_name = names[cur_name]
+ new_node = token.clone(val=new_name)
+ else:
+ new_node = node
+ return new_node
+ return rename_aux(root)
+
+# Yield "interesting" parts of a Prolog AST as lists of tokens.
+def interesting_ranges(ast, path=()):
+ if ast.label() in {'clause', 'head', 'or', 'if', 'and'}:
+ if ast.label() == 'and':
+ for i in range(0, len(ast), 2):
+ for j in range(i, len(ast), 2):
+ subs = ast[i:j+1]
+ terminals = []
+ for s in subs:
+ terminals.extend([s] if isinstance(s, Token) else s.leaves())
+ # We want at least some context.
+ if len(terminals) > 1:
+ yield terminals, path + (ast.label(),)
+ else:
+ terminals = ast.leaves()
+ # We want at least some context.
+ if len(terminals) > 1:
+ yield terminals, path + (ast.label(),)
+
+ for subtree in ast:
+ if isinstance(subtree, Tree):
+ yield from interesting_ranges(subtree, path + (ast.label(),))
+
# Helper function to remove trailing punctuation from lines and rename
# variables to A1,A2,A3,… (potentially using [var_names]). Return a tuple.
def normalized(line, var_names=None):
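
For orientation, a hypothetical driver for the helpers added above. interesting_ranges and rename_vars_list are defined in this diff; the parser.parse call and the shape of its result are assumptions based on the "from .parser import parser" line in this file, not something this commit shows.

from prolog.parser import parser
from prolog.util import interesting_ranges, rename_vars_list

# The program must be syntactically correct, per the commit message.
code = 'sum([], 0). sum([H|T], S) :- sum(T, S1), S is S1 + H.'
ast = parser.parse(code)  # assumed PLY-style parse entry point

# Enumerate tracked ranges with normalized variable names.
for terminals, path in interesting_ranges(ast):
    tokens = rename_vars_list(terminals)
    print(path, ' '.join(t.val for t in tokens))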