1 files changed, 71 insertions, 0 deletions
diff --git a/regex/asttokens/line_numbers.py b/regex/asttokens/line_numbers.py
new file mode 100644
index 0000000..b91b00f
--- /dev/null
+++ b/regex/asttokens/line_numbers.py
@@ -0,0 +1,71 @@
+# Copyright 2016 Grist Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bisect
+import re
+
+_line_start_re = re.compile(r'^', re.MULTILINE)  # '^' matches at offset 0 and right after each newline
+
+class LineNumbers(object):
+  """
+  Class to convert between character offsets in a text string, and pairs (line, column) of 1-based
+  line and 0-based column numbers, as used by tokens and AST nodes.
+
+  This class expects unicode for input and stores positions in unicode. But it supports
+  translating from utf8 columns, which are used by ast parsing.
+  """
+  def __init__(self, text):
+    # A list of character offsets of each line's first character.
+    self._line_offsets = [m.start(0) for m in _line_start_re.finditer(text)]
+    self._text = text
+    self._text_len = len(text)
+    self._utf8_offset_cache = {}    # maps line num to list of char offset for each byte in line
+
+  def from_utf8_col(self, line, utf8_column):
+    """
+    Given a 1-based line number and 0-based utf8 column, returns a 0-based unicode column. Columns
+    outside the line are clamped to 0 or to the line's length in characters (newline included).
+    """
+    offsets = self._utf8_offset_cache.get(line)
+    if offsets is None:
+      end_offset = self._line_offsets[line] if line < len(self._line_offsets) else self._text_len
+      line_text = self._text[self._line_offsets[line - 1] : end_offset]
+      offsets = [i for i, c in enumerate(line_text) for _ in c.encode('utf8')]
+      offsets.append(len(line_text))
+      self._utf8_offset_cache[line] = offsets
+    # Clamp to the last valid index, len(offsets) - 1; using len(offsets) would raise IndexError.
+    return offsets[max(0, min(len(offsets) - 1, utf8_column))]
+
+  def line_to_offset(self, line, column):
+    """
+    Converts 1-based line number and 0-based column to 0-based character offset into text. A line
+    before the first gives 0, one past the last gives len(text); negative columns count as 0.
+    """
+    line -= 1
+    if line < 0:
+      return 0
+    if line >= len(self._line_offsets):
+      return self._text_len
+    return min(self._line_offsets[line] + max(0, column), self._text_len)
+
+  def offset_to_line(self, offset):
+    """
+    Converts 0-based character offset to pair (line, col) of 1-based line and 0-based column
+    numbers. Offsets outside [0, len(text)] are clamped into that range first.
+    """
+    offset = max(0, min(self._text_len, offset))
+    line_index = bisect.bisect_right(self._line_offsets, offset) - 1
+    return (line_index + 1, offset - self._line_offsets[line_index])
+
+