diff options
Diffstat (limited to 'regex/asttokens/line_numbers.py')
-rw-r--r-- | regex/asttokens/line_numbers.py | 71 |
1 files changed, 71 insertions, 0 deletions
diff --git a/regex/asttokens/line_numbers.py b/regex/asttokens/line_numbers.py new file mode 100644 index 0000000..b91b00f --- /dev/null +++ b/regex/asttokens/line_numbers.py @@ -0,0 +1,71 @@ +# Copyright 2016 Grist Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bisect +import re + +_line_start_re = re.compile(r'^', re.M) + +class LineNumbers(object): + """ + Class to convert between character offsets in a text string, and pairs (line, column) of 1-based + line and 0-based column numbers, as used by tokens and AST nodes. + + This class expects unicode for input and stores positions in unicode. But it supports + translating to and from utf8 offsets, which are used by ast parsing. + """ + def __init__(self, text): + # A list of character offsets of each line's first character. + self._line_offsets = [m.start(0) for m in _line_start_re.finditer(text)] + self._text = text + self._text_len = len(text) + self._utf8_offset_cache = {} # maps line num to list of char offset for each byte in line + + def from_utf8_col(self, line, utf8_column): + """ + Given a 1-based line number and 0-based utf8 column, returns a 0-based unicode column. + """ + offsets = self._utf8_offset_cache.get(line) + if offsets is None: + end_offset = self._line_offsets[line] if line < len(self._line_offsets) else self._text_len + line_text = self._text[self._line_offsets[line - 1] : end_offset] + + offsets = [i for i,c in enumerate(line_text) for byte in c.encode('utf8')] + offsets.append(len(line_text)) + self._utf8_offset_cache[line] = offsets + + return offsets[max(0, min(len(offsets), utf8_column))] + + def line_to_offset(self, line, column): + """ + Converts 1-based line number and 0-based column to 0-based character offset into text. + """ + line -= 1 + if line >= len(self._line_offsets): + return self._text_len + elif line < 0: + return 0 + else: + return min(self._line_offsets[line] + max(0, column), self._text_len) + + def offset_to_line(self, offset): + """ + Converts 0-based character offset to pair (line, col) of 1-based line and 0-based column + numbers. + """ + offset = max(0, min(self._text_len, offset)) + line_index = bisect.bisect_right(self._line_offsets, offset) - 1 + return (line_index + 1, offset - self._line_offsets[line_index]) + + |