Implemented a parser of simple pythonic definitions.

author: Aleš Smodiš <aless@guru.si> 2015-07-14 17:57:06 +0200
committer: Aleš Smodiš <aless@guru.si> 2015-07-14 17:57:06 +0200
commit: 13399da7f5eea223db86ff4d30403bc714c5816c (patch)
tree: f49dd400b20c3ff84931994d9d99e5f58cc2be0c /js
parent: 9a9c1113c90cf787a82312a548f2bf3776927d8e (diff)
2 files changed, 423 insertions, 3 deletions
diff --git a/js/codeq.js b/js/codeq.js
index 3a30823..770153e 100644
--- a/js/codeq.js
+++ b/js/codeq.js
@@ -614,7 +614,7 @@ window.siteDefinition = { logLevel: 'debug' }; // for debug purposes
         // --------------------------------------------------------------------------------
         // Task info parser: converts simplified pythonic syntax to a JavaScript function
         // --------------------------------------------------------------------------------
-
+        // deprecated: use codeq.parseDefinition() instead
         parseInfo: function (infoText) {
             var parts = [],
                 n, lines, line, i, j, len, walker, fn, obj;
@@ -700,7 +700,7 @@ window.siteDefinition = { logLevel: 'debug' }; // for debug purposes
 //        if (s.length == 0) return; // empty hash
 //        if (s.charAt(0) == '#') s = s.substring(1);
 //        if (s.length == 0) return; // empty hash
-        var editor = CodeMirror.fromTextArea(document.getElementById('program'), { cursorHeight: 0.85, lineNumbers: true, matchBrackets: true });
+        var editor = CodeMirror(document.getElementById('code_editor'), { cursorHeight: 0.85, lineNumbers: true, matchBrackets: true });
         editor.setValue('sister(X, Y) :-\n    female(X),\n    parent(Z, X),\n    parent(Z, Y),\n    X \\== Y.');
 
 /*        $('#console').terminal(function (command, term) {
@@ -734,7 +734,7 @@ window.siteDefinition = { logLevel: 'debug' }; // for debug purposes
             url: 'sister.py',
             callback: function (data, status, url) {
                 if (!data) return;
-                var info = codeq.system.parseInfo(data);
+                var info = codeq.parseDefinition(data);
                 $('#description').html(info.description);
             }
         });
diff --git a/js/def_parser.js b/js/def_parser.js
new file mode 100644
index 0000000..91ca35d
--- /dev/null
+++ b/js/def_parser.js
@@ -0,0 +1,420 @@
+/**
+ * A parser/compiler for the pythonic definitions of assignments.
+ * Supports basic assignment statements, no expressions using operators (yet).
+ */
+
+(function () {
+    // Single-character classes; each expression is meant to be matched against one character at a time.
+    var regexpWhitespace = /[ \t]/, // a space or a tab
+        regexpNameStart = /[a-zA-Z_]/, // a character that may start a name
+        regexpName = /[a-zA-Z0-9_]/, // a character that may continue a name
+        regexpNumber = /[0-9]/, // a decimal digit
+        regexpNumberAndDot = /[0-9.]/; // a decimal digit or the decimal point
+
+    /**
+     * Tells whether the character that follows index pos in s is escaped, i.e. whether
+     * the run of consecutive backslashes ending at index pos has an odd length.
+     *
+     * @param {string} s the text preceding the character in question
+     * @param {number} pos index of the last character of s to inspect; -1 means there is nothing to inspect
+     * @returns {boolean} true when an odd number of backslashes ends at pos
+     */
+    var isEscape = function (s, pos) {
+        var i, result = false;
+        // walk backwards over the whole run of backslashes: every pair of them is an escaped backslash
+        for (i = pos; i >= 0; i--) {
+            if (s.charAt(i) === '\\') result = !result;
+            else break;
+        }
+        return result;
+    };
+
+    /**
+     * Appends a single-quoted JavaScript string literal for the raw contents of a
+     * pythonic string literal to output.
+     *
+     * Escape sequences already present in s are passed through unchanged. Single quotes
+     * that are not already escaped get a backslash, and literal CR, TAB and LF characters
+     * are replaced by their escape sequences, so the literal stays on one line.
+     *
+     * @param {string} s the string contents, without the delimiting quotes
+     * @param {Array} output the list of code fragments to append to
+     */
+    var escapePyString = function (s, output) {
+        var parts = s.split("'"),
+            n = parts.length,
+            i, part, previousPart,
+            escapeControlChars = function (text) {
+                return text.split('\r').join('\\r').split('\t').join('\\t').split('\n').join('\\n');
+            };
+        output.push("'"); // starting quote
+
+        previousPart = parts[0];
+        output.push(escapeControlChars(previousPart));
+        for (i = 1; i < n; i++) {
+            part = parts[i];
+            // put back the quote that split() consumed; escape it unless the source text already did
+            output.push(isEscape(previousPart, previousPart.length - 1) ? "'" : "\\'");
+            output.push(escapeControlChars(part));
+            previousPart = part;
+        }
+
+        output.push("'"); // ending quote
+    };
+
+    /**
+     * Creates a tokenizer over the text of a definition.
+     *
+     * The returned function yields one token per invocation and null once the input is
+     * exhausted. A token is an object with the properties:
+     *   type: 'string', 'name', 'number', 'eq', 'assign', 'lbrace', 'rbrace', 'lbracket',
+     *         'rbracket', 'lparen', 'rparen', 'colon', 'semicolon', 'comma', 'comment' or 'unknown'
+     *   value: the contents of a string/name/comment, the numeric value of a number, or the
+     *          offending text of an unknown token; absent for the other types
+     *   line_start: the line-start flag, see the note on is_line_start below
+     *   whitespace_offset: the amount of white-space skipped before the token since the previous
+     *          token or line break, where a tab counts as 4
+     *   line, column: 1-based position of the first character of the token
+     *
+     * @param {string} input the text to tokenize
+     * @returns {function} the next() function
+     * @throws {Error} thrown by next() on an unterminated string and on a CR that is not followed by a LF
+     */
+    var tokenize = function (input) {
+        var pos = 0, row = 1, col = 1, // index of the next unread character, and its 1-based line and column
+            n = input.length,
+            // NOTE(review): this flag is only ever set to true, so every token reports line_start === true;
+            // callers may have come to rely on that -- confirm before resetting it after each token
+            is_line_start = true,
+            punctuation = {
+                '{': 'lbrace', '}': 'rbrace',
+                '[': 'lbracket', ']': 'rbracket',
+                '(': 'lparen', ')': 'rparen',
+                ':': 'colon', ';': 'semicolon', ',': 'comma'
+            },
+            // tells whether the run of consecutive backslashes that ends at index last, looking no further back than index first, has an odd length
+            isOddBackslashRun = function (text, last, first) {
+                var odd = false, k;
+                for (k = last; k >= first; k--) {
+                    if (text.charAt(k) !== '\\') break;
+                    odd = !odd;
+                }
+                return odd;
+            },
+            // reads a long string; pos must point just after the opening triple quote, type is the closing triple quote
+            getMultilineString = function (type) {
+                var p = pos, i, l, parts, part, result;
+                while (p < n) {
+                    i = input.indexOf(type, p);
+                    if (i < 0) throw new Error('Unterminated long string at position #' + pos + ', line ' + row + ', character ' + col);
+                    if (isOddBackslashRun(input, i - 1, pos)) {
+                        p = i + 1; // an escaped quote, keep looking
+                        continue;
+                    }
+                    parts = input.slice(pos, i).split('\n');
+                    col += i + 3 - pos; // position the next column pointer
+                    pos = i + 3; // position the next character pointer
+                    l = parts.length - 1;
+                    if (l === 0) return parts[0]; // no newline character in the string
+                    row += l; // add the lines of the string
+                    col = 1 + parts[l].length + 3; // the last line of the string plus the closing quotes
+                    result = []; // the resulting string
+                    // process line continuations
+                    for (i = 0; i < l; i++) {
+                        part = parts[i];
+                        if (isOddBackslashRun(part, part.length - 1, 0)) {
+                            // a trailing escape character joins the lines: drop it along with the newline
+                            result.push(part.slice(0, part.length - 1));
+                        }
+                        else {
+                            // add the line and the trailing newline
+                            result.push(part, '\n');
+                        }
+                    }
+                    result.push(parts[l]); // the last line, which is not checked for an escape character
+                    return result.join('');
+                }
+                throw new Error('Unterminated long string at position #' + pos + ', line ' + row + ', character ' + col);
+            },
+            // reads a single-line string; pos must point just after the opening quote, type is the closing quote
+            getString = function (type) {
+                var p = pos, i, j, s;
+                while (p < n) {
+                    i = input.indexOf(type, p);
+                    j = input.indexOf('\n', p);
+                    if (i < 0) throw new Error('Unterminated string at position #' + pos + ', line ' + row + ', character ' + col);
+                    if ((j >= 0) && (j < i)) throw new Error('Unterminated string at position #' + pos + ', line ' + row + ', character ' + col);
+                    if (isOddBackslashRun(input, i - 1, pos)) p = i + 1; // an escaped quote
+                    else {
+                        s = input.slice(pos, i);
+                        col += i + 1 - pos; // position the next column pointer
+                        pos = i + 1; // position the next character pointer
+                        return s;
+                    }
+                }
+                throw new Error('Unterminated string at position #' + pos + ', line ' + row + ', character ' + col);
+            };
+
+        // the next() function
+        return function () {
+            var s, count = 0, token = null, parts, i, c, token_row, token_col, has_dot;
+            if (pos >= n) return null;
+            c = input.charAt(pos++);
+
+            // skip white-space
+            for (;;) {
+                if (c === ' ') {
+                    count++;
+                    col++;
+                }
+                else if (c === '\t') {
+                    // tab is worth 4 spaces
+                    count += 4;
+                    col++;
+                }
+                else if ((c === '\n') || (c === '\r')) {
+                    if (c === '\r') {
+                        if (pos >= n) return null; // EOF
+                        // pos already points at the character after the CR, so check it before consuming it
+                        if (input.charAt(pos) !== '\n') throw new Error("CR character without a trailing LF at character #" + (pos-1));
+                        pos++; // consume the LF
+                    }
+                    count = 0;
+                    is_line_start = true;
+                    row++;
+                    col = 1;
+                }
+                else break;
+                if (pos >= n) return null; // EOF
+                c = input.charAt(pos++);
+            }
+
+            // the token starts here, after the skipped white-space and line breaks
+            token_row = row;
+            token_col = col;
+
+            if ((c === '"') || (c === "'")) {
+                col++; // the opening quote
+                // a quote starts a string, now see whether it's a multi-line string
+                if ((pos+1 < n) && (input.charAt(pos) === c) && (input.charAt(pos+1) === c)) {
+                    // it's a long string
+                    pos += 2;
+                    col += 2;
+                    s = getMultilineString(c === '"' ? '"""' : "'''");
+                }
+                else {
+                    s = getString(c);
+                }
+                token = {'type': 'string', 'value': s};
+            }
+
+            else if (c.match(regexpNameStart)) {
+                parts = [ c ]; // the name builder
+                col++;
+                while (pos < n) {
+                    c = input.charAt(pos);
+                    if (c.match(regexpName)) {
+                        parts.push(c);
+                        pos++;
+                        col++;
+                    }
+                    else break;
+                }
+                token = {'type': 'name', 'value': parts.join('')};
+            }
+
+            else if (c.match(regexpNumberAndDot)) {
+                parts = [ c ]; // the number builder
+                col++;
+                has_dot = (c === '.');
+                // digits with at most one decimal point among them
+                while (pos < n) {
+                    c = input.charAt(pos);
+                    if ((c === '.') && !has_dot) has_dot = true;
+                    else if (!c.match(regexpNumber)) break;
+                    parts.push(c);
+                    pos++;
+                    col++;
+                }
+                s = parts.join('');
+                if (s === '.') token = {'type': 'unknown', 'value': s}; // a lone dot is not a number
+                else token = {'type': 'number', 'value': +s};
+            }
+
+            else if (c === '=') {
+                col++;
+                if ((pos < n) && (input.charAt(pos) === '=')) {
+                    pos++;
+                    col++;
+                    token = {'type': 'eq'};
+                }
+                else {
+                    token = {'type': 'assign'};
+                }
+            }
+
+            else if (Object.prototype.hasOwnProperty.call(punctuation, c)) {
+                token = {'type': punctuation[c]};
+                col++;
+            }
+
+            else if (c === '#') {
+                // a comment till the end of line
+                i = input.indexOf('\n', pos);
+                if (i < 0) i = n; // this is the last line, the comment runs up to the end of input
+                if ((i > pos) && (input.charAt(i - 1) === '\r')) i--; // leave the CR of a CRLF line break out of the comment
+                s = input.slice(pos, i);
+                token = {'type': 'comment', 'value': s};
+                col += 1 + i - pos; // the hash character and the comment text
+                pos = i; // we want the next invocation to parse the trailing newline, so it correctly sets line_start, etc.
+            }
+
+            else {
+                token = {'type': 'unknown', 'value': c};
+                col++;
+            }
+
+            token['line_start'] = is_line_start;
+            token['whitespace_offset'] = count;
+            token['line'] = token_row;
+            token['column'] = token_col;
+            return token;
+        };
+    };
+
+    /**
+     * Compiles one rvalue expression into JavaScript source, appending the fragments to output.
+     *
+     * Supported expressions are string literals, number literals, object literals with
+     * string or number keys, and array literals; the values inside objects and arrays are
+     * compiled recursively. A trailing comma before the closing } or ] is accepted and
+     * not copied to the output.
+     *
+     * @param {Object} token the first token of the expression, already read from the tokenizer
+     * @param {function} next returns the next token, or a falsy value at the end of input
+     * @param {Array} output the list of code fragments to append to
+     * @throws {Error} on an unsupported or malformed expression, and when the input ends inside an expression
+     */
+    var parseExpression = function (token, next, output) {
+        var nextToken,
+            // reads the next token; running out of input is reported at the token that started this expression
+            requireNext = function () {
+                var t = next();
+                if (!t) throw new Error("Unfinished line, at line " + token.line + ", column " + token.column);
+                return t;
+            };
+
+        if (token.type === 'string') {
+            // string literal
+            escapePyString(token.value, output);
+        }
+        else if (token.type === 'number') {
+            // number literal
+            output.push('' + token.value);
+        }
+        else if (token.type === 'lbrace') {
+            // object literal
+            output.push('{');
+            nextToken = requireNext();
+            while (nextToken.type !== 'rbrace') {
+                // the key
+                if (nextToken.type === 'string') {
+                    escapePyString(nextToken.value, output);
+                }
+                else if (nextToken.type === 'number') {
+                    output.push('' + nextToken.value);
+                }
+                else throw new Error("Object key not a string or a number, at line " + nextToken.line + ", column " + nextToken.column);
+
+                nextToken = requireNext();
+                if (nextToken.type !== 'colon') throw new Error("Expected :, at line " + nextToken.line + ", column " + nextToken.column);
+                output.push(':');
+
+                // the value
+                parseExpression(requireNext(), next, output);
+
+                nextToken = requireNext();
+                if (nextToken.type === 'rbrace') break; // end of object literal
+                if (nextToken.type !== 'comma') throw new Error("Expected , or }, at line " + nextToken.line + ", column " + nextToken.column);
+
+                nextToken = requireNext();
+                if (nextToken.type !== 'rbrace') output.push(','); // emit the separator only when another entry follows
+            }
+            output.push('}');
+        }
+        else if (token.type === 'lbracket') {
+            // array literal
+            output.push('[');
+            nextToken = requireNext();
+            while (nextToken.type !== 'rbracket') {
+                parseExpression(nextToken, next, output);
+
+                nextToken = requireNext();
+                if (nextToken.type === 'rbracket') break; // end of array literal
+                if (nextToken.type !== 'comma') throw new Error("Expected , or ], at line " + nextToken.line + ", column " + nextToken.column);
+
+                nextToken = requireNext();
+                if (nextToken.type !== 'rbracket') output.push(','); // emit the separator only when another element follows
+            }
+            output.push(']');
+        }
+        else throw new Error("Unexpected token: expected a string, number, object, or array, at line " + token.line + ", column " + token.column);
+    };
+
+    /**
+     * Parses a pythonic definition and evaluates it.
+     *
+     * The definition is a sequence of non-indented assignment statements of the form
+     * name = expression or name[index] = expression, where index is a string or a number
+     * literal, plus comments. The statements are compiled into the body of a JavaScript
+     * function that declares every assigned name as a local variable; the function is then run.
+     *
+     * NOTE(review): the compiled text is executed through new Function. Names are restricted by
+     * the tokenizer and string literals are re-escaped, but the definition should still come
+     * from a trusted source -- confirm with the callers.
+     *
+     * @param {string} definition the text of the definition
+     * @returns {Object} an object holding the final values of the variables "description" and "hint";
+     *                   an empty object when the definition contains no statements
+     * @throws {Error} on a syntax error in the definition; errors raised while compiling or running
+     *                 the generated function (e.g. for a name that is a JavaScript reserved word) propagate
+     */
+    codeq.parseDefinition = function (definition) {
+        var next = tokenize(definition),
+            // a map without a prototype, so names such as "hasOwnProperty" or "__proto__" are stored like any other name
+            vars = Object.create(null),
+            parts = [ ], // first element is just a placeholder, to be replaced with the full "var" declaration at the end
+            token, first_token, varname, v, fn, obj;
+
+        // these two are always declared, because they are always copied to the result
+        vars['description'] = true;
+        vars['hint'] = true;
+
+        while (token = next()) {
+            // parse line by line
+            if (token.type === 'comment') continue; // a comment may appear anywhere, also after a statement
+            if (!token.line_start) throw new Error("The token does not start in a new line, at line " + token.line + ", column " + token.column);
+            if (token.whitespace_offset > 0) throw new Error("Cannot parse indented lines, at line " + token.line);
+
+            // parse lvalue
+            if (token.type !== 'name') throw new Error("Expected a lvalue, at line " + token.line + ", column " + token.column);
+            varname = token.value;
+            vars[varname] = true; // remember the variable name, so we will declare it at the end
+            parts.push(';\n'); // close the previous line -- the first time this is wrong because there is no previous line yet, but we will replace it with a variable declaration
+            parts.push(varname); // start the new line with the assignment statement -- the only statement we support
+            first_token = token;
+            if (!(token = next())) throw new Error("Unfinished line, at line " + first_token.line);
+
+            // optional index
+            if (token.type === 'lbracket') {
+                // index operator
+                parts.push('[');
+                if (!(token = next())) throw new Error("Unfinished line, at line " + first_token.line);
+                if (token.type === 'string') escapePyString(token.value, parts);
+                else if (token.type === 'number') parts.push('' + token.value);
+                else throw new Error("Unsupported index expression, at line " + token.line + ", column " + token.column);
+                if (!(token = next())) throw new Error("Unfinished line, at line " + first_token.line);
+                if (token.type !== 'rbracket') throw new Error("Expected ], at line " + token.line + ", column " + token.column + ", token " + token.type);
+                parts.push(']');
+                if (!(token = next())) throw new Error("Unfinished line, at line " + first_token.line);
+            }
+
+            // the assignment operator
+            if (token.type !== 'assign') throw new Error("Expected =, at line " + token.line + ", column " + token.column);
+            parts.push('=');
+            if (!(token = next())) throw new Error("Unfinished line, at line " + first_token.line);
+
+            // parse rvalue
+            parseExpression(token, next, parts);
+        }
+
+        if (parts.length === 0) return {}; // empty definition
+
+        v = [];
+        for (varname in vars) v.push(varname); // vars has no prototype, so only the recorded names are enumerated
+
+        parts[0] = 'var ' + v.join(', ') + ';\n';
+        parts.push(';\n__params__.description = description;\n__params__.hint = hint;');
+        v = parts.join('');
+        codeq.log.debug("Creating a new parseInfo function having the body:\n" + v);
+        fn = new Function("__params__", v);
+        obj = {};
+        fn(obj);
+        return obj; // obj now contains "description" and "hint"
+    }; // parseDefinition
+
+})();
+\ No newline at end of file
author	Aleš Smodiš <aless@guru.si>	2015-07-14 17:57:06 +0200
committer	Aleš Smodiš <aless@guru.si>	2015-07-14 17:57:06 +0200
commit	13399da7f5eea223db86ff4d30403bc714c5816c (patch)
tree	f49dd400b20c3ff84931994d9d99e5f58cc2be0c /js
parent	9a9c1113c90cf787a82312a548f2bf3776927d8e (diff)