These days, I have been working on improving SimpleScript, my compiler to JavaScript. Today, I want to comment on the implementation of the lexer. The repo is:
http://github.com/ajlopez/SimpleScript
Now, the lexer code resides in a dedicated file, lib/lexer.js, which exposes a module that can be consumed from Node.js and from the browser. It starts by defining the token types:
var lexer = (function () { var TokenType = { Name: 1, Integer: 2, Real: 3, String: 4, NewLine: 5, Separator: 6, Assignment: 7 };
Then, it defines the separators, the assignment operators, the other operators, and the Token, with two elements: type and value.
// Single-character separators; each character of the string is one separator.
var separators = '.,()[]';

// Assignment operators: plain assignment and the compound arithmetic forms.
var assignments = [
    '=',
    '+=',
    '-=',
    '*=',
    '/='
];

// Arithmetic and comparison operators.
var operators = [
    '+',
    '-',
    '*',
    '/',
    '==',
    '!=',
    '<',
    '>',
    '<=',
    '>='
];

/**
 * A lexical token: a value paired with a type.
 * Intended to be invoked with `new`.
 *
 * @param value the token's value, stored as-is
 * @param type  the token's type, stored as-is
 */
function Token(value, type) {
    this.value = value;
    this.type = type;
}
The main job is in the Lexer “class”, with the method nextToken:
function Lexer(text) { var length = text ? text.length : 0; var position = 0; var next = []; this.nextToken = function () { if (next.length > 0) return next.pop(); skipSpaces(); var ch = nextChar(); if (ch === null) return null; if (ch === '"' || ch === "'") return nextString(ch); if (ch === '\n') return new Token(ch, TokenType.NewLine); if (ch === '\r') { var ch2 = nextChar(); if (ch2 === '\n') return new Token(ch + ch2, TokenType.NewLine); if (ch2) pushChar(ch2); return new Token(ch, TokenType.NewLine); } if (isAssignment(ch)) return new Token(ch, TokenType.Assignment); if (isOperator(ch)) return nextOperator(ch); if (isSeparator(ch)) return new Token(ch, TokenType.Separator); if (isFirstCharOfName(ch)) return nextName(ch); if (isDigit(ch)) return nextInteger(ch); }
Finally, the module exposes a lexer factory and the enumeration of token types:
return { lexer: function (text) { return new Lexer(text); }, TokenType: TokenType }
The code was developed using a Test-Driven Development workflow. Here is a fragment of the test file, test/lexer.js:
/**
 * Asserts that lexing `text` yields exactly one token with the given
 * value and type, and that the following call to nextToken() returns null.
 *
 * @param text  source text handed to the lexer factory
 * @param value expected token value
 * @param type  expected token type
 * @param test  test object providing ok() and equal()
 */
function getToken(text, value, type, test) {
    var lx = sslexer.lexer(text);
    var first = lx.nextToken();

    test.ok(first);
    test.equal(first.value, value);
    test.equal(first.type, type);

    // Nothing may follow the single expected token.
    test.equal(lx.nextToken(), null);
}

// Each name must come back as one Name token whose value is the text itself.
exports['Get names'] = function (test) {
    ['foo', 'foo123', 'foo_123', '_foo'].forEach(function (name) {
        getToken(name, name, TokenType.Name, test);
    });
};

// Each digit run must come back as one Integer token whose value is the text itself.
exports['Get integer'] = function (test) {
    ['123', '1234567890'].forEach(function (digits) {
        getToken(digits, digits, TokenType.Integer, test);
    });
};
Remember: no TDD, no paradise 😉
Next topics: the parser, the implementation of commands and expressions, and compilation to JavaScript.
Stay tuned!
Angel “Java” Lopez