1 files changed, 275 insertions, 0 deletions
diff --git a/regex/asttokens/mark_tokens.py b/regex/asttokens/mark_tokens.py
new file mode 100644
index 0000000..f48fb77
--- /dev/null
+++ b/regex/asttokens/mark_tokens.py
@@ -0,0 +1,275 @@
+# Copyright 2016 Grist Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import six
+import numbers
+import token
+from . import util
+
+
+# Mapping of matching braces. To find a token here, look up token[:2].
+_matching_pairs_left = {
+  (token.OP, '('): (token.OP, ')'),
+  (token.OP, '['): (token.OP, ']'),
+  (token.OP, '{'): (token.OP, '}'),
+}
+
+_matching_pairs_right = {
+  (token.OP, ')'): (token.OP, '('),
+  (token.OP, ']'): (token.OP, '['),
+  (token.OP, '}'): (token.OP, '{'),
+}
+
+
+class MarkTokens(object):
+  """
+  Helper that visits all nodes in the AST tree and assigns .first_token and .last_token attributes
+  to each of them. This is the heart of the token-marking logic.
+  """
+  def __init__(self, code):
+    self._code = code
+    self._methods = util.NodeMethods()
+    self._iter_children = None
+
+  def visit_tree(self, node):
+    self._iter_children = util.iter_children_func(node)
+    util.visit_tree(node, self._visit_before_children, self._visit_after_children)
+
+  def _visit_before_children(self, node, parent_token):
+    col = getattr(node, 'col_offset', None)
+    token = self._code.get_token_from_utf8(node.lineno, col) if col is not None else None
+
+    if not token and util.is_module(node):
+      # We'll assume that a Module node starts at the start of the source code.
+      token = self._code.get_token(1, 0)
+
+    # Use our own token, or our parent's if we don't have one, to pass to child calls as
+    # parent_token argument. The second value becomes the token argument of _visit_after_children.
+    return (token or parent_token, token)
+
+  def _visit_after_children(self, node, parent_token, token):
+    # This processes the node generically first, after all children have been processed.
+
+    # Get the first and last tokens that belong to children. Note how this doesn't assume that we
+    # iterate through children in order that corresponds to occurrence in source code. This
+    # assumption can fail (e.g. with return annotations).
+    first = token
+    last = None
+    for child in self._iter_children(node):
+      if not first or child.first_token.index < first.index:
+        first = child.first_token
+      if not last or child.last_token.index > last.index:
+        last = child.last_token
+
+    # If we don't have a first token from _visit_before_children, and there were no children, then
+    # use the parent's token as the first token.
+    first = first or parent_token
+
+    # If no children, set last token to the first one.
+    last = last or first
+
+    # Statements continue to before NEWLINE. This helps cover a few different cases at once.
+    if util.is_stmt(node):
+      last = self._find_last_in_line(last)
+
+    # Capture any unmatched brackets.
+    first, last = self._expand_to_matching_pairs(first, last, node)
+
+    # Give a chance to node-specific methods to adjust.
+    nfirst, nlast = self._methods.get(self, node.__class__)(node, first, last)
+
+    if (nfirst, nlast) != (first, last):
+      # If anything changed, expand again to capture any unmatched brackets.
+      nfirst, nlast = self._expand_to_matching_pairs(nfirst, nlast, node)
+
+    node.first_token = nfirst
+    node.last_token = nlast
+
+  def _find_last_in_line(self, start_token):
+    try:
+      newline = self._code.find_token(start_token, token.NEWLINE)
+    except IndexError:
+      newline = self._code.find_token(start_token, token.ENDMARKER)
+    return self._code.prev_token(newline)
+
+  def _iter_non_child_tokens(self, first_token, last_token, node):
+    """
+    Generates all tokens in [first_token, last_token] range that do not belong to any children of
+    node. E.g. `foo(bar)` has children `foo` and `bar`, but we would yield the `(`.
+    """
+    tok = first_token
+    for n in self._iter_children(node):
+      for t in self._code.token_range(tok, self._code.prev_token(n.first_token)):
+        yield t
+      if n.last_token.index >= last_token.index:
+        return
+      tok = self._code.next_token(n.last_token)
+
+    for t in self._code.token_range(tok, last_token):
+      yield t
+
+  def _expand_to_matching_pairs(self, first_token, last_token, node):
+    """
+    Scan tokens in [first_token, last_token] range that are between node's children, and for any
+    unmatched brackets, adjust first/last tokens to include the closing pair.
+    """
+    # We look for opening parens/braces among non-child tokens (i.e. tokens between our actual
+    # child nodes). If we find any closing ones, we match them to the opens.
+    to_match_right = []
+    to_match_left = []
+    for tok in self._iter_non_child_tokens(first_token, last_token, node):
+      tok_info = tok[:2]
+      if to_match_right and tok_info == to_match_right[-1]:
+        to_match_right.pop()
+      elif tok_info in _matching_pairs_left:
+        to_match_right.append(_matching_pairs_left[tok_info])
+      elif tok_info in _matching_pairs_right:
+        to_match_left.append(_matching_pairs_right[tok_info])
+
+    # Once done, extend `last_token` to match any unclosed parens/braces.
+    for match in reversed(to_match_right):
+      last = self._code.next_token(last_token)
+      # Allow for a trailing comma before the closing delimiter.
+      if util.match_token(last, token.OP, ','):
+        last = self._code.next_token(last)
+      # Now check for the actual closing delimiter.
+      if util.match_token(last, *match):
+        last_token = last
+
+    # And extend `first_token` to match any unclosed opening parens/braces.
+    for match in to_match_left:
+      first = self._code.prev_token(first_token)
+      if util.match_token(first, *match):
+        first_token = first
+
+    return (first_token, last_token)
+
+  #----------------------------------------------------------------------
+  # Node visitors. Each takes a preliminary first and last tokens, and returns the adjusted pair
+  # that will actually be assigned.
+
+  def visit_default(self, node, first_token, last_token):
+    # pylint: disable=no-self-use
+    # By default, we don't need to adjust the token we computed earlier.
+    return (first_token, last_token)
+
+  def handle_comp(self, open_brace, node, first_token, last_token):
+    # For list/set/dict comprehensions, we only get the token of the first child, so adjust it to
+    # include the opening brace (the closing brace will be matched automatically).
+    before = self._code.prev_token(first_token)
+    util.expect_token(before, token.OP, open_brace)
+    return (before, last_token)
+
+  def visit_listcomp(self, node, first_token, last_token):
+    return self.handle_comp('[', node, first_token, last_token)
+
+  if six.PY2:
+    # We shouldn't do this on PY3 because its SetComp/DictComp already have a correct start.
+    def visit_setcomp(self, node, first_token, last_token):
+      return self.handle_comp('{', node, first_token, last_token)
+
+    def visit_dictcomp(self, node, first_token, last_token):
+      return self.handle_comp('{', node, first_token, last_token)
+
+  def visit_comprehension(self, node, first_token, last_token):
+    # The 'comprehension' node starts with 'for' but we only get first child; we search backwards
+    # to find the 'for' keyword.
+    first = self._code.find_token(first_token, token.NAME, 'for', reverse=True)
+    return (first, last_token)
+
+  def handle_attr(self, node, first_token, last_token):
+    # Attribute node has ".attr" (2 tokens) after the last child.
+    dot = self._code.find_token(last_token, token.OP, '.')
+    name = self._code.next_token(dot)
+    util.expect_token(name, token.NAME)
+    return (first_token, name)
+
+  visit_attribute = handle_attr
+  visit_assignattr = handle_attr
+  visit_delattr = handle_attr
+
+  def handle_doc(self, node, first_token, last_token):
+    # With astroid, nodes that start with a doc-string can have an empty body, in which case we
+    # need to adjust the last token to include the doc string.
+    if not node.body and getattr(node, 'doc', None):
+      last_token = self._code.find_token(last_token, token.STRING)
+    return (first_token, last_token)
+
+  visit_classdef = handle_doc
+  visit_funcdef = handle_doc
+
+  def visit_call(self, node, first_token, last_token):
+    # A function call isn't over until we see a closing paren. Remember that last_token is at the
+    # end of all children, so we are not worried about encountering a paren that belongs to a
+    # child.
+    return (first_token, self._code.find_token(last_token, token.OP, ')'))
+
+  def visit_subscript(self, node, first_token, last_token):
+    # A subscript operations isn't over until we see a closing bracket. Similar to function calls.
+    return (first_token, self._code.find_token(last_token, token.OP, ']'))
+
+  def visit_tuple(self, node, first_token, last_token):
+    # A tuple doesn't include parens; if there is a trailing comma, make it part of the tuple.
+    try:
+      maybe_comma = self._code.next_token(last_token)
+      if util.match_token(maybe_comma, token.OP, ','):
+        last_token = maybe_comma
+    except IndexError:
+      pass
+    return (first_token, last_token)
+
+  def visit_str(self, node, first_token, last_token):
+    # Multiple adjacent STRING tokens form a single string.
+    last = self._code.next_token(last_token)
+    while util.match_token(last, token.STRING):
+      last_token = last
+      last = self._code.next_token(last_token)
+    return (first_token, last_token)
+
+  def visit_num(self, node, first_token, last_token):
+    # A constant like '-1' gets turned into two tokens; this will skip the '-'.
+    while util.match_token(last_token, token.OP):
+      last_token = self._code.next_token(last_token)
+    return (first_token, last_token)
+
+  # In Astroid, the Num and Str nodes are replaced by Const.
+  def visit_const(self, node, first_token, last_token):
+    if isinstance(node.value, numbers.Number):
+      return self.visit_num(node, first_token, last_token)
+    elif isinstance(node.value, six.string_types):
+      return self.visit_str(node, first_token, last_token)
+    return (first_token, last_token)
+
+  def visit_keyword(self, node, first_token, last_token):
+    if node.arg is not None:
+      equals = self._code.find_token(first_token, token.OP, '=', reverse=True)
+      name = self._code.prev_token(equals)
+      util.expect_token(name, token.NAME, node.arg)
+      first_token = name
+    return (first_token, last_token)
+
+  def visit_starred(self, node, first_token, last_token):
+    # Astroid has 'Starred' nodes (for "foo(*bar)" type args), but they need to be adjusted.
+    if not util.match_token(first_token, token.OP, '*'):
+      star = self._code.prev_token(first_token)
+      if util.match_token(star, token.OP, '*'):
+        first_token = star
+    return (first_token, last_token)
+
+  def visit_assignname(self, node, first_token, last_token):
+    # Astroid may turn 'except' clause into AssignName, but we need to adjust it.
+    if util.match_token(first_token, token.NAME, 'except'):
+      colon = self._code.find_token(last_token, token.OP, ':')
+      first_token = last_token = self._code.prev_token(colon)
+    return (first_token, last_token)