| author | Andrew Kelley <superjoe30@gmail.com> | 2018-02-09 13:08:02 -0500 |
|---|---|---|
| committer | Andrew Kelley <superjoe30@gmail.com> | 2018-02-09 13:08:02 -0500 |
| commit | 1fb308ceeea0259ad021d67945ea5adc10960a85 (patch) | |
| tree | eceb252e06a6ed0cc179bc4cdf5a698057da6761 /src-self-hosted/tokenizer.zig | |
| parent | 3919afcad26d2359efe52f98cd4f2f0573527369 (diff) | |
| download | zig-1fb308ceeea0259ad021d67945ea5adc10960a85.tar.gz zig-1fb308ceeea0259ad021d67945ea5adc10960a85.zip | |
self hosted compiler: move tokenization and parsing to std lib
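Since this commit removes the self-hosted copy wholesale, downstream code is expected to pull the tokenizer from the standard library instead. Below is a minimal sketch of the new import site, assuming the moved code is exposed under a std.zig namespace as in later Zig versions (the exact destination path is not shown in this diff), written in the Zig syntax of the era:

```zig
const std = @import("std");

// Assumption: the relocated tokenizer is re-exported by the std lib,
// e.g. as std.zig.Tokenizer; before this commit the self-hosted compiler
// used @import("tokenizer.zig").Tokenizer from src-self-hosted/.
const Tokenizer = std.zig.Tokenizer;
const Token = std.zig.Token;
```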
Diffstat (limited to 'src-self-hosted/tokenizer.zig')
| -rw-r--r-- | src-self-hosted/tokenizer.zig | 659 |
|---|---|---|
1 file changed, 0 insertions, 659 deletions
diff --git a/src-self-hosted/tokenizer.zig b/src-self-hosted/tokenizer.zig
deleted file mode 100644
index f1f87abfc9..0000000000
--- a/src-self-hosted/tokenizer.zig
+++ /dev/null
@@ -1,659 +0,0 @@
-const std = @import("std");
-const mem = std.mem;
-
-pub const Token = struct {
-    id: Id,
-    start: usize,
-    end: usize,
-
-    const KeywordId = struct {
-        bytes: []const u8,
-        id: Id,
-    };
-
-    const keywords = []KeywordId {
-        KeywordId{.bytes="align", .id = Id.Keyword_align},
-        KeywordId{.bytes="and", .id = Id.Keyword_and},
-        KeywordId{.bytes="asm", .id = Id.Keyword_asm},
-        KeywordId{.bytes="break", .id = Id.Keyword_break},
-        KeywordId{.bytes="comptime", .id = Id.Keyword_comptime},
-        KeywordId{.bytes="const", .id = Id.Keyword_const},
-        KeywordId{.bytes="continue", .id = Id.Keyword_continue},
-        KeywordId{.bytes="defer", .id = Id.Keyword_defer},
-        KeywordId{.bytes="else", .id = Id.Keyword_else},
-        KeywordId{.bytes="enum", .id = Id.Keyword_enum},
-        KeywordId{.bytes="error", .id = Id.Keyword_error},
-        KeywordId{.bytes="export", .id = Id.Keyword_export},
-        KeywordId{.bytes="extern", .id = Id.Keyword_extern},
-        KeywordId{.bytes="false", .id = Id.Keyword_false},
-        KeywordId{.bytes="fn", .id = Id.Keyword_fn},
-        KeywordId{.bytes="for", .id = Id.Keyword_for},
-        KeywordId{.bytes="goto", .id = Id.Keyword_goto},
-        KeywordId{.bytes="if", .id = Id.Keyword_if},
-        KeywordId{.bytes="inline", .id = Id.Keyword_inline},
-        KeywordId{.bytes="nakedcc", .id = Id.Keyword_nakedcc},
-        KeywordId{.bytes="noalias", .id = Id.Keyword_noalias},
-        KeywordId{.bytes="null", .id = Id.Keyword_null},
-        KeywordId{.bytes="or", .id = Id.Keyword_or},
-        KeywordId{.bytes="packed", .id = Id.Keyword_packed},
-        KeywordId{.bytes="pub", .id = Id.Keyword_pub},
-        KeywordId{.bytes="return", .id = Id.Keyword_return},
-        KeywordId{.bytes="stdcallcc", .id = Id.Keyword_stdcallcc},
-        KeywordId{.bytes="struct", .id = Id.Keyword_struct},
-        KeywordId{.bytes="switch", .id = Id.Keyword_switch},
-        KeywordId{.bytes="test", .id = Id.Keyword_test},
-        KeywordId{.bytes="this", .id = Id.Keyword_this},
-        KeywordId{.bytes="true", .id = Id.Keyword_true},
-        KeywordId{.bytes="undefined", .id = Id.Keyword_undefined},
-        KeywordId{.bytes="union", .id = Id.Keyword_union},
-        KeywordId{.bytes="unreachable", .id = Id.Keyword_unreachable},
-        KeywordId{.bytes="use", .id = Id.Keyword_use},
-        KeywordId{.bytes="var", .id = Id.Keyword_var},
-        KeywordId{.bytes="volatile", .id = Id.Keyword_volatile},
-        KeywordId{.bytes="while", .id = Id.Keyword_while},
-    };
-
-    fn getKeyword(bytes: []const u8) ?Id {
-        for (keywords) |kw| {
-            if (mem.eql(u8, kw.bytes, bytes)) {
-                return kw.id;
-            }
-        }
-        return null;
-    }
-
-    const StrLitKind = enum {Normal, C};
-
-    pub const Id = union(enum) {
-        Invalid,
-        Identifier,
-        StringLiteral: StrLitKind,
-        Eof,
-        Builtin,
-        Bang,
-        Equal,
-        EqualEqual,
-        BangEqual,
-        LParen,
-        RParen,
-        Semicolon,
-        Percent,
-        LBrace,
-        RBrace,
-        Period,
-        Ellipsis2,
-        Ellipsis3,
-        Minus,
-        Arrow,
-        Colon,
-        Slash,
-        Comma,
-        Ampersand,
-        AmpersandEqual,
-        IntegerLiteral,
-        FloatLiteral,
-        Keyword_align,
-        Keyword_and,
-        Keyword_asm,
-        Keyword_break,
-        Keyword_comptime,
-        Keyword_const,
-        Keyword_continue,
-        Keyword_defer,
-        Keyword_else,
-        Keyword_enum,
-        Keyword_error,
-        Keyword_export,
-        Keyword_extern,
-        Keyword_false,
-        Keyword_fn,
-        Keyword_for,
-        Keyword_goto,
-        Keyword_if,
-        Keyword_inline,
-        Keyword_nakedcc,
-        Keyword_noalias,
-        Keyword_null,
-        Keyword_or,
-        Keyword_packed,
-        Keyword_pub,
-        Keyword_return,
-        Keyword_stdcallcc,
-        Keyword_struct,
-        Keyword_switch,
-        Keyword_test,
-        Keyword_this,
-        Keyword_true,
-        Keyword_undefined,
-        Keyword_union,
-        Keyword_unreachable,
-        Keyword_use,
-        Keyword_var,
-        Keyword_volatile,
-        Keyword_while,
-    };
-};
-
-pub const Tokenizer = struct {
-    buffer: []const u8,
-    index: usize,
-    pending_invalid_token: ?Token,
-
-    pub const Location = struct {
-        line: usize,
-        column: usize,
-        line_start: usize,
-        line_end: usize,
-    };
-
-    pub fn getTokenLocation(self: &Tokenizer, token: &const Token) Location {
-        var loc = Location {
-            .line = 0,
-            .column = 0,
-            .line_start = 0,
-            .line_end = 0,
-        };
-        for (self.buffer) |c, i| {
-            if (i == token.start) {
-                loc.line_end = i;
-                while (loc.line_end < self.buffer.len and self.buffer[loc.line_end] != '\n') : (loc.line_end += 1) {}
-                return loc;
-            }
-            if (c == '\n') {
-                loc.line += 1;
-                loc.column = 0;
-                loc.line_start = i + 1;
-            } else {
-                loc.column += 1;
-            }
-        }
-        return loc;
-    }
-
-    /// For debugging purposes
-    pub fn dump(self: &Tokenizer, token: &const Token) void {
-        std.debug.warn("{} \"{}\"\n", @tagName(token.id), self.buffer[token.start..token.end]);
-    }
-
-    /// buffer must end with "\n\n\n". This is so that attempting to decode
-    /// a the 3 trailing bytes of a 4-byte utf8 sequence is never a buffer overflow.
-    pub fn init(buffer: []const u8) Tokenizer {
-        std.debug.assert(buffer[buffer.len - 1] == '\n');
-        std.debug.assert(buffer[buffer.len - 2] == '\n');
-        std.debug.assert(buffer[buffer.len - 3] == '\n');
-        return Tokenizer {
-            .buffer = buffer,
-            .index = 0,
-            .pending_invalid_token = null,
-        };
-    }
-
-    const State = enum {
-        Start,
-        Identifier,
-        Builtin,
-        C,
-        StringLiteral,
-        StringLiteralBackslash,
-        Equal,
-        Bang,
-        Minus,
-        Slash,
-        LineComment,
-        Zero,
-        IntegerLiteral,
-        IntegerLiteralWithRadix,
-        NumberDot,
-        FloatFraction,
-        FloatExponentUnsigned,
-        FloatExponentNumber,
-        Ampersand,
-        Period,
-        Period2,
-    };
-
-    pub fn next(self: &Tokenizer) Token {
-        if (self.pending_invalid_token) |token| {
-            self.pending_invalid_token = null;
-            return token;
-        }
-        var state = State.Start;
-        var result = Token {
-            .id = Token.Id.Eof,
-            .start = self.index,
-            .end = undefined,
-        };
-        while (self.index < self.buffer.len) : (self.index += 1) {
-            const c = self.buffer[self.index];
-            switch (state) {
-                State.Start => switch (c) {
-                    ' ', '\n' => {
-                        result.start = self.index + 1;
-                    },
-                    'c' => {
-                        state = State.C;
-                        result.id = Token.Id.Identifier;
-                    },
-                    '"' => {
-                        state = State.StringLiteral;
-                        result.id = Token.Id { .StringLiteral = Token.StrLitKind.Normal };
-                    },
-                    'a'...'b', 'd'...'z', 'A'...'Z', '_' => {
-                        state = State.Identifier;
-                        result.id = Token.Id.Identifier;
-                    },
-                    '@' => {
-                        state = State.Builtin;
-                        result.id = Token.Id.Builtin;
-                    },
-                    '=' => {
-                        state = State.Equal;
-                    },
-                    '!' => {
-                        state = State.Bang;
-                    },
-                    '(' => {
-                        result.id = Token.Id.LParen;
-                        self.index += 1;
-                        break;
-                    },
-                    ')' => {
-                        result.id = Token.Id.RParen;
-                        self.index += 1;
-                        break;
-                    },
-                    ';' => {
-                        result.id = Token.Id.Semicolon;
-                        self.index += 1;
-                        break;
-                    },
-                    ',' => {
-                        result.id = Token.Id.Comma;
-                        self.index += 1;
-                        break;
-                    },
-                    ':' => {
-                        result.id = Token.Id.Colon;
-                        self.index += 1;
-                        break;
-                    },
-                    '%' => {
-                        result.id = Token.Id.Percent;
-                        self.index += 1;
-                        break;
-                    },
-                    '{' => {
-                        result.id = Token.Id.LBrace;
-                        self.index += 1;
-                        break;
-                    },
-                    '}' => {
-                        result.id = Token.Id.RBrace;
-                        self.index += 1;
-                        break;
-                    },
-                    '.' => {
-                        state = State.Period;
-                    },
-                    '-' => {
-                        state = State.Minus;
-                    },
-                    '/' => {
-                        state = State.Slash;
-                    },
-                    '&' => {
-                        state = State.Ampersand;
-                    },
-                    '0' => {
-                        state = State.Zero;
-                        result.id = Token.Id.IntegerLiteral;
-                    },
-                    '1'...'9' => {
-                        state = State.IntegerLiteral;
-                        result.id = Token.Id.IntegerLiteral;
-                    },
-                    else => {
-                        result.id = Token.Id.Invalid;
-                        self.index += 1;
-                        break;
-                    },
-                },
-                State.Ampersand => switch (c) {
-                    '=' => {
-                        result.id = Token.Id.AmpersandEqual;
-                        self.index += 1;
-                        break;
-                    },
-                    else => {
-                        result.id = Token.Id.Ampersand;
-                        break;
-                    },
-                },
-                State.Identifier => switch (c) {
-                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
-                    else => {
-                        if (Token.getKeyword(self.buffer[result.start..self.index])) |id| {
-                            result.id = id;
-                        }
-                        break;
-                    },
-                },
-                State.Builtin => switch (c) {
-                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
-                    else => break,
-                },
-                State.C => switch (c) {
-                    '\\' => @panic("TODO"),
-                    '"' => {
-                        state = State.StringLiteral;
-                        result.id = Token.Id { .StringLiteral = Token.StrLitKind.C };
-                    },
-                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {
-                        state = State.Identifier;
-                    },
-                    else => break,
-                },
-                State.StringLiteral => switch (c) {
-                    '\\' => {
-                        state = State.StringLiteralBackslash;
-                    },
-                    '"' => {
-                        self.index += 1;
-                        break;
-                    },
-                    '\n' => break, // Look for this error later.
-                    else => self.checkLiteralCharacter(),
-                },
-
-                State.StringLiteralBackslash => switch (c) {
-                    '\n' => break, // Look for this error later.
-                    else => {
-                        state = State.StringLiteral;
-                    },
-                },
-
-                State.Bang => switch (c) {
-                    '=' => {
-                        result.id = Token.Id.BangEqual;
-                        self.index += 1;
-                        break;
-                    },
-                    else => {
-                        result.id = Token.Id.Bang;
-                        break;
-                    },
-                },
-
-                State.Equal => switch (c) {
-                    '=' => {
-                        result.id = Token.Id.EqualEqual;
-                        self.index += 1;
-                        break;
-                    },
-                    else => {
-                        result.id = Token.Id.Equal;
-                        break;
-                    },
-                },
-
-                State.Minus => switch (c) {
-                    '>' => {
-                        result.id = Token.Id.Arrow;
-                        self.index += 1;
-                        break;
-                    },
-                    else => {
-                        result.id = Token.Id.Minus;
-                        break;
-                    },
-                },
-
-                State.Period => switch (c) {
-                    '.' => {
-                        state = State.Period2;
-                    },
-                    else => {
-                        result.id = Token.Id.Period;
-                        break;
-                    },
-                },
-
-                State.Period2 => switch (c) {
-                    '.' => {
-                        result.id = Token.Id.Ellipsis3;
-                        self.index += 1;
-                        break;
-                    },
-                    else => {
-                        result.id = Token.Id.Ellipsis2;
-                        break;
-                    },
-                },
-
-                State.Slash => switch (c) {
-                    '/' => {
-                        result.id = undefined;
-                        state = State.LineComment;
-                    },
-                    else => {
-                        result.id = Token.Id.Slash;
-                        break;
-                    },
-                },
-                State.LineComment => switch (c) {
-                    '\n' => {
-                        state = State.Start;
-                        result = Token {
-                            .id = Token.Id.Eof,
-                            .start = self.index + 1,
-                            .end = undefined,
-                        };
-                    },
-                    else => self.checkLiteralCharacter(),
-                },
-                State.Zero => switch (c) {
-                    'b', 'o', 'x' => {
-                        state = State.IntegerLiteralWithRadix;
-                    },
-                    else => {
-                        // reinterpret as a normal number
-                        self.index -= 1;
-                        state = State.IntegerLiteral;
-                    },
-                },
-                State.IntegerLiteral => switch (c) {
-                    '.' => {
-                        state = State.NumberDot;
-                    },
-                    'p', 'P', 'e', 'E' => {
-                        state = State.FloatExponentUnsigned;
-                    },
-                    '0'...'9' => {},
-                    else => break,
-                },
-                State.IntegerLiteralWithRadix => switch (c) {
-                    '.' => {
-                        state = State.NumberDot;
-                    },
-                    'p', 'P' => {
-                        state = State.FloatExponentUnsigned;
-                    },
-                    '0'...'9', 'a'...'f', 'A'...'F' => {},
-                    else => break,
-                },
-                State.NumberDot => switch (c) {
-                    '.' => {
-                        self.index -= 1;
-                        state = State.Start;
-                        break;
-                    },
-                    else => {
-                        self.index -= 1;
-                        result.id = Token.Id.FloatLiteral;
-                        state = State.FloatFraction;
-                    },
-                },
-                State.FloatFraction => switch (c) {
-                    'p', 'P' => {
-                        state = State.FloatExponentUnsigned;
-                    },
-                    '0'...'9', 'a'...'f', 'A'...'F' => {},
-                    else => break,
-                },
-                State.FloatExponentUnsigned => switch (c) {
-                    '+', '-' => {
-                        state = State.FloatExponentNumber;
-                    },
-                    else => {
-                        // reinterpret as a normal exponent number
-                        self.index -= 1;
-                        state = State.FloatExponentNumber;
-                    }
-                },
-                State.FloatExponentNumber => switch (c) {
-                    '0'...'9', 'a'...'f', 'A'...'F' => {},
-                    else => break,
-                },
-            }
-        }
-        result.end = self.index;
-
-        if (result.id == Token.Id.Eof) {
-            if (self.pending_invalid_token) |token| {
-                self.pending_invalid_token = null;
-                return token;
-            }
-        }
-
-        return result;
-    }
-
-    pub fn getTokenSlice(self: &const Tokenizer, token: &const Token) []const u8 {
-        return self.buffer[token.start..token.end];
-    }
-
-    fn checkLiteralCharacter(self: &Tokenizer) void {
-        if (self.pending_invalid_token != null) return;
-        const invalid_length = self.getInvalidCharacterLength();
-        if (invalid_length == 0) return;
-        self.pending_invalid_token = Token {
-            .id = Token.Id.Invalid,
-            .start = self.index,
-            .end = self.index + invalid_length,
-        };
-    }
-
-    fn getInvalidCharacterLength(self: &Tokenizer) u3 {
-        const c0 = self.buffer[self.index];
-        if (c0 < 0x80) {
-            if (c0 < 0x20 or c0 == 0x7f) {
-                // ascii control codes are never allowed
-                // (note that \n was checked before we got here)
-                return 1;
-            }
-            // looks fine to me.
-            return 0;
-        } else {
-            // check utf8-encoded character.
-            const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
-            // the last 3 bytes in the buffer are guaranteed to be '\n',
-            // which means we don't need to do any bounds checking here.
-            const bytes = self.buffer[self.index..self.index + length];
-            switch (length) {
-                2 => {
-                    const value = std.unicode.utf8Decode2(bytes) catch return length;
-                    if (value == 0x85) return length; // U+0085 (NEL)
-                },
-                3 => {
-                    const value = std.unicode.utf8Decode3(bytes) catch return length;
-                    if (value == 0x2028) return length; // U+2028 (LS)
-                    if (value == 0x2029) return length; // U+2029 (PS)
-                },
-                4 => {
-                    _ = std.unicode.utf8Decode4(bytes) catch return length;
-                },
-                else => unreachable,
-            }
-            self.index += length - 1;
-            return 0;
-        }
-    }
-};
-
-
-
-test "tokenizer" {
-    testTokenize("test", []Token.Id {
-        Token.Id.Keyword_test,
-    });
-}
-
-test "tokenizer - invalid token characters" {
-    testTokenize("#", []Token.Id{Token.Id.Invalid});
-    testTokenize("`", []Token.Id{Token.Id.Invalid});
-}
-
-test "tokenizer - invalid literal/comment characters" {
-    testTokenize("\"\x00\"", []Token.Id {
-        Token.Id { .StringLiteral = Token.StrLitKind.Normal },
-        Token.Id.Invalid,
-    });
-    testTokenize("//\x00", []Token.Id {
-        Token.Id.Invalid,
-    });
-    testTokenize("//\x1f", []Token.Id {
-        Token.Id.Invalid,
-    });
-    testTokenize("//\x7f", []Token.Id {
-        Token.Id.Invalid,
-    });
-}
-
-test "tokenizer - utf8" {
-    testTokenize("//\xc2\x80", []Token.Id{});
-    testTokenize("//\xf4\x8f\xbf\xbf", []Token.Id{});
-}
-
-test "tokenizer - invalid utf8" {
-    testTokenize("//\x80", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xbf", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf8", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xff", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xc2\xc0", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe0", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf0", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf0\x90\x80\xc0", []Token.Id{Token.Id.Invalid});
-}
-
-test "tokenizer - illegal unicode codepoints" {
-    // unicode newline characters. U+0085, U+2028, U+2029
-    testTokenize("//\xc2\x84", []Token.Id{});
-    testTokenize("//\xc2\x85", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xc2\x86", []Token.Id{});
-    testTokenize("//\xe2\x80\xa7", []Token.Id{});
-    testTokenize("//\xe2\x80\xa8", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe2\x80\xa9", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe2\x80\xaa", []Token.Id{});
-}
-
-fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void {
-    // (test authors, just make this bigger if you need it)
-    var padded_source: [0x100]u8 = undefined;
-    std.mem.copy(u8, padded_source[0..source.len], source);
-    padded_source[source.len + 0] = '\n';
-    padded_source[source.len + 1] = '\n';
-    padded_source[source.len + 2] = '\n';
-
-    var tokenizer = Tokenizer.init(padded_source[0..source.len + 3]);
-    for (expected_tokens) |expected_token_id| {
-        const token = tokenizer.next();
-        std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id));
-        switch (expected_token_id) {
-            Token.Id.StringLiteral => |expected_kind| {
-                std.debug.assert(expected_kind == switch (token.id) { Token.Id.StringLiteral => |kind| kind, else => unreachable });
-            },
-            else => {},
-        }
-    }
-    std.debug.assert(tokenizer.next().id == Token.Id.Eof);
}
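For reference, here is a short sketch of how the deleted Tokenizer was driven, using only the API visible in the diff above. It is written in the pre-0.2 Zig syntax of this commit (so it will not compile with modern compilers), and dumpTokens is a hypothetical helper, not part of the file:

```zig
const std = @import("std");

// Hypothetical driver for the Tokenizer shown above. The source buffer must
// end with "\n\n\n": init() asserts this, and the trailing newlines let the
// UTF-8 validation read up to 3 bytes ahead without any bounds checking.
fn dumpTokens(source: []const u8) void {
    var tokenizer = Tokenizer.init(source);
    while (true) {
        const token = tokenizer.next();
        // next() yields Token.Id.Eof once the input is exhausted, after first
        // flushing any pending_invalid_token recorded by checkLiteralCharacter().
        if (token.id == Token.Id.Eof) break;
        // Same output as Tokenizer.dump(): the tag name plus the token's slice.
        std.debug.warn("{} \"{}\"\n", @tagName(token.id), tokenizer.buffer[token.start..token.end]);
    }
}
```

For example, dumpTokens("const x = 1;\n\n\n") would print Keyword_const, Identifier, Equal, IntegerLiteral, and Semicolon along with their source slices.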
