path: root/src-self-hosted/tokenizer.zig
author     Andrew Kelley <superjoe30@gmail.com>  2018-02-09 13:08:02 -0500
committer  Andrew Kelley <superjoe30@gmail.com>  2018-02-09 13:08:02 -0500
commit     1fb308ceeea0259ad021d67945ea5adc10960a85 (patch)
tree       eceb252e06a6ed0cc179bc4cdf5a698057da6761 /src-self-hosted/tokenizer.zig
parent     3919afcad26d2359efe52f98cd4f2f0573527369 (diff)
download   zig-1fb308ceeea0259ad021d67945ea5adc10960a85.tar.gz
           zig-1fb308ceeea0259ad021d67945ea5adc10960a85.zip
self hosted compiler: move tokenization and parsing to std lib
Diffstat (limited to 'src-self-hosted/tokenizer.zig')
-rw-r--r--  src-self-hosted/tokenizer.zig  659
1 file changed, 0 insertions, 659 deletions
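The entire file below is deleted; after this commit the tokenizer (and parser) live in the standard library rather than in the self-hosted compiler tree. A minimal sketch of what the import swap looks like for a consumer, assuming the tokenizer lands at std.zig.Tokenizer (the exact std path is an assumption based on the commit message, not something shown in this diff):

const std = @import("std");

// Before this commit the self-hosted compiler imported its own copy:
//     const Tokenizer = @import("tokenizer.zig").Tokenizer;
// After the move the same type is reached through the standard library.
// The std.zig path below is an assumption based on the commit message:
const Tokenizer = std.zig.Tokenizer;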
diff --git a/src-self-hosted/tokenizer.zig b/src-self-hosted/tokenizer.zig
deleted file mode 100644
index f1f87abfc9..0000000000
--- a/src-self-hosted/tokenizer.zig
+++ /dev/null
@@ -1,659 +0,0 @@
-const std = @import("std");
-const mem = std.mem;
-
-pub const Token = struct {
- id: Id,
- start: usize,
- end: usize,
-
- const KeywordId = struct {
- bytes: []const u8,
- id: Id,
- };
-
- const keywords = []KeywordId {
- KeywordId{.bytes="align", .id = Id.Keyword_align},
- KeywordId{.bytes="and", .id = Id.Keyword_and},
- KeywordId{.bytes="asm", .id = Id.Keyword_asm},
- KeywordId{.bytes="break", .id = Id.Keyword_break},
- KeywordId{.bytes="comptime", .id = Id.Keyword_comptime},
- KeywordId{.bytes="const", .id = Id.Keyword_const},
- KeywordId{.bytes="continue", .id = Id.Keyword_continue},
- KeywordId{.bytes="defer", .id = Id.Keyword_defer},
- KeywordId{.bytes="else", .id = Id.Keyword_else},
- KeywordId{.bytes="enum", .id = Id.Keyword_enum},
- KeywordId{.bytes="error", .id = Id.Keyword_error},
- KeywordId{.bytes="export", .id = Id.Keyword_export},
- KeywordId{.bytes="extern", .id = Id.Keyword_extern},
- KeywordId{.bytes="false", .id = Id.Keyword_false},
- KeywordId{.bytes="fn", .id = Id.Keyword_fn},
- KeywordId{.bytes="for", .id = Id.Keyword_for},
- KeywordId{.bytes="goto", .id = Id.Keyword_goto},
- KeywordId{.bytes="if", .id = Id.Keyword_if},
- KeywordId{.bytes="inline", .id = Id.Keyword_inline},
- KeywordId{.bytes="nakedcc", .id = Id.Keyword_nakedcc},
- KeywordId{.bytes="noalias", .id = Id.Keyword_noalias},
- KeywordId{.bytes="null", .id = Id.Keyword_null},
- KeywordId{.bytes="or", .id = Id.Keyword_or},
- KeywordId{.bytes="packed", .id = Id.Keyword_packed},
- KeywordId{.bytes="pub", .id = Id.Keyword_pub},
- KeywordId{.bytes="return", .id = Id.Keyword_return},
- KeywordId{.bytes="stdcallcc", .id = Id.Keyword_stdcallcc},
- KeywordId{.bytes="struct", .id = Id.Keyword_struct},
- KeywordId{.bytes="switch", .id = Id.Keyword_switch},
- KeywordId{.bytes="test", .id = Id.Keyword_test},
- KeywordId{.bytes="this", .id = Id.Keyword_this},
- KeywordId{.bytes="true", .id = Id.Keyword_true},
- KeywordId{.bytes="undefined", .id = Id.Keyword_undefined},
- KeywordId{.bytes="union", .id = Id.Keyword_union},
- KeywordId{.bytes="unreachable", .id = Id.Keyword_unreachable},
- KeywordId{.bytes="use", .id = Id.Keyword_use},
- KeywordId{.bytes="var", .id = Id.Keyword_var},
- KeywordId{.bytes="volatile", .id = Id.Keyword_volatile},
- KeywordId{.bytes="while", .id = Id.Keyword_while},
- };
-
- fn getKeyword(bytes: []const u8) ?Id {
- for (keywords) |kw| {
- if (mem.eql(u8, kw.bytes, bytes)) {
- return kw.id;
- }
- }
- return null;
- }
-
- const StrLitKind = enum {Normal, C};
-
- pub const Id = union(enum) {
- Invalid,
- Identifier,
- StringLiteral: StrLitKind,
- Eof,
- Builtin,
- Bang,
- Equal,
- EqualEqual,
- BangEqual,
- LParen,
- RParen,
- Semicolon,
- Percent,
- LBrace,
- RBrace,
- Period,
- Ellipsis2,
- Ellipsis3,
- Minus,
- Arrow,
- Colon,
- Slash,
- Comma,
- Ampersand,
- AmpersandEqual,
- IntegerLiteral,
- FloatLiteral,
- Keyword_align,
- Keyword_and,
- Keyword_asm,
- Keyword_break,
- Keyword_comptime,
- Keyword_const,
- Keyword_continue,
- Keyword_defer,
- Keyword_else,
- Keyword_enum,
- Keyword_error,
- Keyword_export,
- Keyword_extern,
- Keyword_false,
- Keyword_fn,
- Keyword_for,
- Keyword_goto,
- Keyword_if,
- Keyword_inline,
- Keyword_nakedcc,
- Keyword_noalias,
- Keyword_null,
- Keyword_or,
- Keyword_packed,
- Keyword_pub,
- Keyword_return,
- Keyword_stdcallcc,
- Keyword_struct,
- Keyword_switch,
- Keyword_test,
- Keyword_this,
- Keyword_true,
- Keyword_undefined,
- Keyword_union,
- Keyword_unreachable,
- Keyword_use,
- Keyword_var,
- Keyword_volatile,
- Keyword_while,
- };
-};
-
-pub const Tokenizer = struct {
- buffer: []const u8,
- index: usize,
- pending_invalid_token: ?Token,
-
- pub const Location = struct {
- line: usize,
- column: usize,
- line_start: usize,
- line_end: usize,
- };
-
- pub fn getTokenLocation(self: &Tokenizer, token: &const Token) Location {
- var loc = Location {
- .line = 0,
- .column = 0,
- .line_start = 0,
- .line_end = 0,
- };
- for (self.buffer) |c, i| {
- if (i == token.start) {
- loc.line_end = i;
- while (loc.line_end < self.buffer.len and self.buffer[loc.line_end] != '\n') : (loc.line_end += 1) {}
- return loc;
- }
- if (c == '\n') {
- loc.line += 1;
- loc.column = 0;
- loc.line_start = i + 1;
- } else {
- loc.column += 1;
- }
- }
- return loc;
- }
-
- /// For debugging purposes
- pub fn dump(self: &Tokenizer, token: &const Token) void {
- std.debug.warn("{} \"{}\"\n", @tagName(token.id), self.buffer[token.start..token.end]);
- }
-
- /// buffer must end with "\n\n\n". This is so that attempting to decode
- /// the 3 trailing bytes of a 4-byte utf8 sequence is never a buffer overflow.
- pub fn init(buffer: []const u8) Tokenizer {
- std.debug.assert(buffer[buffer.len - 1] == '\n');
- std.debug.assert(buffer[buffer.len - 2] == '\n');
- std.debug.assert(buffer[buffer.len - 3] == '\n');
- return Tokenizer {
- .buffer = buffer,
- .index = 0,
- .pending_invalid_token = null,
- };
- }
-
- const State = enum {
- Start,
- Identifier,
- Builtin,
- C,
- StringLiteral,
- StringLiteralBackslash,
- Equal,
- Bang,
- Minus,
- Slash,
- LineComment,
- Zero,
- IntegerLiteral,
- IntegerLiteralWithRadix,
- NumberDot,
- FloatFraction,
- FloatExponentUnsigned,
- FloatExponentNumber,
- Ampersand,
- Period,
- Period2,
- };
-
- pub fn next(self: &Tokenizer) Token {
- if (self.pending_invalid_token) |token| {
- self.pending_invalid_token = null;
- return token;
- }
- var state = State.Start;
- var result = Token {
- .id = Token.Id.Eof,
- .start = self.index,
- .end = undefined,
- };
- while (self.index < self.buffer.len) : (self.index += 1) {
- const c = self.buffer[self.index];
- switch (state) {
- State.Start => switch (c) {
- ' ', '\n' => {
- result.start = self.index + 1;
- },
- 'c' => {
- state = State.C;
- result.id = Token.Id.Identifier;
- },
- '"' => {
- state = State.StringLiteral;
- result.id = Token.Id { .StringLiteral = Token.StrLitKind.Normal };
- },
- 'a'...'b', 'd'...'z', 'A'...'Z', '_' => {
- state = State.Identifier;
- result.id = Token.Id.Identifier;
- },
- '@' => {
- state = State.Builtin;
- result.id = Token.Id.Builtin;
- },
- '=' => {
- state = State.Equal;
- },
- '!' => {
- state = State.Bang;
- },
- '(' => {
- result.id = Token.Id.LParen;
- self.index += 1;
- break;
- },
- ')' => {
- result.id = Token.Id.RParen;
- self.index += 1;
- break;
- },
- ';' => {
- result.id = Token.Id.Semicolon;
- self.index += 1;
- break;
- },
- ',' => {
- result.id = Token.Id.Comma;
- self.index += 1;
- break;
- },
- ':' => {
- result.id = Token.Id.Colon;
- self.index += 1;
- break;
- },
- '%' => {
- result.id = Token.Id.Percent;
- self.index += 1;
- break;
- },
- '{' => {
- result.id = Token.Id.LBrace;
- self.index += 1;
- break;
- },
- '}' => {
- result.id = Token.Id.RBrace;
- self.index += 1;
- break;
- },
- '.' => {
- state = State.Period;
- },
- '-' => {
- state = State.Minus;
- },
- '/' => {
- state = State.Slash;
- },
- '&' => {
- state = State.Ampersand;
- },
- '0' => {
- state = State.Zero;
- result.id = Token.Id.IntegerLiteral;
- },
- '1'...'9' => {
- state = State.IntegerLiteral;
- result.id = Token.Id.IntegerLiteral;
- },
- else => {
- result.id = Token.Id.Invalid;
- self.index += 1;
- break;
- },
- },
- State.Ampersand => switch (c) {
- '=' => {
- result.id = Token.Id.AmpersandEqual;
- self.index += 1;
- break;
- },
- else => {
- result.id = Token.Id.Ampersand;
- break;
- },
- },
- State.Identifier => switch (c) {
- 'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
- else => {
- if (Token.getKeyword(self.buffer[result.start..self.index])) |id| {
- result.id = id;
- }
- break;
- },
- },
- State.Builtin => switch (c) {
- 'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
- else => break,
- },
- State.C => switch (c) {
- '\\' => @panic("TODO"),
- '"' => {
- state = State.StringLiteral;
- result.id = Token.Id { .StringLiteral = Token.StrLitKind.C };
- },
- 'a'...'z', 'A'...'Z', '_', '0'...'9' => {
- state = State.Identifier;
- },
- else => break,
- },
- State.StringLiteral => switch (c) {
- '\\' => {
- state = State.StringLiteralBackslash;
- },
- '"' => {
- self.index += 1;
- break;
- },
- '\n' => break, // Look for this error later.
- else => self.checkLiteralCharacter(),
- },
-
- State.StringLiteralBackslash => switch (c) {
- '\n' => break, // Look for this error later.
- else => {
- state = State.StringLiteral;
- },
- },
-
- State.Bang => switch (c) {
- '=' => {
- result.id = Token.Id.BangEqual;
- self.index += 1;
- break;
- },
- else => {
- result.id = Token.Id.Bang;
- break;
- },
- },
-
- State.Equal => switch (c) {
- '=' => {
- result.id = Token.Id.EqualEqual;
- self.index += 1;
- break;
- },
- else => {
- result.id = Token.Id.Equal;
- break;
- },
- },
-
- State.Minus => switch (c) {
- '>' => {
- result.id = Token.Id.Arrow;
- self.index += 1;
- break;
- },
- else => {
- result.id = Token.Id.Minus;
- break;
- },
- },
-
- State.Period => switch (c) {
- '.' => {
- state = State.Period2;
- },
- else => {
- result.id = Token.Id.Period;
- break;
- },
- },
-
- State.Period2 => switch (c) {
- '.' => {
- result.id = Token.Id.Ellipsis3;
- self.index += 1;
- break;
- },
- else => {
- result.id = Token.Id.Ellipsis2;
- break;
- },
- },
-
- State.Slash => switch (c) {
- '/' => {
- result.id = undefined;
- state = State.LineComment;
- },
- else => {
- result.id = Token.Id.Slash;
- break;
- },
- },
- State.LineComment => switch (c) {
- '\n' => {
- state = State.Start;
- result = Token {
- .id = Token.Id.Eof,
- .start = self.index + 1,
- .end = undefined,
- };
- },
- else => self.checkLiteralCharacter(),
- },
- State.Zero => switch (c) {
- 'b', 'o', 'x' => {
- state = State.IntegerLiteralWithRadix;
- },
- else => {
- // reinterpret as a normal number
- self.index -= 1;
- state = State.IntegerLiteral;
- },
- },
- State.IntegerLiteral => switch (c) {
- '.' => {
- state = State.NumberDot;
- },
- 'p', 'P', 'e', 'E' => {
- state = State.FloatExponentUnsigned;
- },
- '0'...'9' => {},
- else => break,
- },
- State.IntegerLiteralWithRadix => switch (c) {
- '.' => {
- state = State.NumberDot;
- },
- 'p', 'P' => {
- state = State.FloatExponentUnsigned;
- },
- '0'...'9', 'a'...'f', 'A'...'F' => {},
- else => break,
- },
- State.NumberDot => switch (c) {
- '.' => {
- self.index -= 1;
- state = State.Start;
- break;
- },
- else => {
- self.index -= 1;
- result.id = Token.Id.FloatLiteral;
- state = State.FloatFraction;
- },
- },
- State.FloatFraction => switch (c) {
- 'p', 'P' => {
- state = State.FloatExponentUnsigned;
- },
- '0'...'9', 'a'...'f', 'A'...'F' => {},
- else => break,
- },
- State.FloatExponentUnsigned => switch (c) {
- '+', '-' => {
- state = State.FloatExponentNumber;
- },
- else => {
- // reinterpret as a normal exponent number
- self.index -= 1;
- state = State.FloatExponentNumber;
- }
- },
- State.FloatExponentNumber => switch (c) {
- '0'...'9', 'a'...'f', 'A'...'F' => {},
- else => break,
- },
- }
- }
- result.end = self.index;
-
- if (result.id == Token.Id.Eof) {
- if (self.pending_invalid_token) |token| {
- self.pending_invalid_token = null;
- return token;
- }
- }
-
- return result;
- }
-
- pub fn getTokenSlice(self: &const Tokenizer, token: &const Token) []const u8 {
- return self.buffer[token.start..token.end];
- }
-
- fn checkLiteralCharacter(self: &Tokenizer) void {
- if (self.pending_invalid_token != null) return;
- const invalid_length = self.getInvalidCharacterLength();
- if (invalid_length == 0) return;
- self.pending_invalid_token = Token {
- .id = Token.Id.Invalid,
- .start = self.index,
- .end = self.index + invalid_length,
- };
- }
-
- fn getInvalidCharacterLength(self: &Tokenizer) u3 {
- const c0 = self.buffer[self.index];
- if (c0 < 0x80) {
- if (c0 < 0x20 or c0 == 0x7f) {
- // ascii control codes are never allowed
- // (note that \n was checked before we got here)
- return 1;
- }
- // looks fine to me.
- return 0;
- } else {
- // check utf8-encoded character.
- const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
- // the last 3 bytes in the buffer are guaranteed to be '\n',
- // which means we don't need to do any bounds checking here.
- const bytes = self.buffer[self.index..self.index + length];
- switch (length) {
- 2 => {
- const value = std.unicode.utf8Decode2(bytes) catch return length;
- if (value == 0x85) return length; // U+0085 (NEL)
- },
- 3 => {
- const value = std.unicode.utf8Decode3(bytes) catch return length;
- if (value == 0x2028) return length; // U+2028 (LS)
- if (value == 0x2029) return length; // U+2029 (PS)
- },
- 4 => {
- _ = std.unicode.utf8Decode4(bytes) catch return length;
- },
- else => unreachable,
- }
- self.index += length - 1;
- return 0;
- }
- }
-};
-
-
-
-test "tokenizer" {
- testTokenize("test", []Token.Id {
- Token.Id.Keyword_test,
- });
-}
-
-test "tokenizer - invalid token characters" {
- testTokenize("#", []Token.Id{Token.Id.Invalid});
- testTokenize("`", []Token.Id{Token.Id.Invalid});
-}
-
-test "tokenizer - invalid literal/comment characters" {
- testTokenize("\"\x00\"", []Token.Id {
- Token.Id { .StringLiteral = Token.StrLitKind.Normal },
- Token.Id.Invalid,
- });
- testTokenize("//\x00", []Token.Id {
- Token.Id.Invalid,
- });
- testTokenize("//\x1f", []Token.Id {
- Token.Id.Invalid,
- });
- testTokenize("//\x7f", []Token.Id {
- Token.Id.Invalid,
- });
-}
-
-test "tokenizer - utf8" {
- testTokenize("//\xc2\x80", []Token.Id{});
- testTokenize("//\xf4\x8f\xbf\xbf", []Token.Id{});
-}
-
-test "tokenizer - invalid utf8" {
- testTokenize("//\x80", []Token.Id{Token.Id.Invalid});
- testTokenize("//\xbf", []Token.Id{Token.Id.Invalid});
- testTokenize("//\xf8", []Token.Id{Token.Id.Invalid});
- testTokenize("//\xff", []Token.Id{Token.Id.Invalid});
- testTokenize("//\xc2\xc0", []Token.Id{Token.Id.Invalid});
- testTokenize("//\xe0", []Token.Id{Token.Id.Invalid});
- testTokenize("//\xf0", []Token.Id{Token.Id.Invalid});
- testTokenize("//\xf0\x90\x80\xc0", []Token.Id{Token.Id.Invalid});
-}
-
-test "tokenizer - illegal unicode codepoints" {
- // unicode newline characters: U+0085, U+2028, U+2029
- testTokenize("//\xc2\x84", []Token.Id{});
- testTokenize("//\xc2\x85", []Token.Id{Token.Id.Invalid});
- testTokenize("//\xc2\x86", []Token.Id{});
- testTokenize("//\xe2\x80\xa7", []Token.Id{});
- testTokenize("//\xe2\x80\xa8", []Token.Id{Token.Id.Invalid});
- testTokenize("//\xe2\x80\xa9", []Token.Id{Token.Id.Invalid});
- testTokenize("//\xe2\x80\xaa", []Token.Id{});
-}
-
-fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void {
- // (test authors, just make this bigger if you need it)
- var padded_source: [0x100]u8 = undefined;
- std.mem.copy(u8, padded_source[0..source.len], source);
- padded_source[source.len + 0] = '\n';
- padded_source[source.len + 1] = '\n';
- padded_source[source.len + 2] = '\n';
-
- var tokenizer = Tokenizer.init(padded_source[0..source.len + 3]);
- for (expected_tokens) |expected_token_id| {
- const token = tokenizer.next();
- std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id));
- switch (expected_token_id) {
- Token.Id.StringLiteral => |expected_kind| {
- std.debug.assert(expected_kind == switch (token.id) { Token.Id.StringLiteral => |kind| kind, else => unreachable });
- },
- else => {},
- }
- }
- std.debug.assert(tokenizer.next().id == Token.Id.Eof);
-}
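For reference, the API deleted above is driven the same way testTokenize drives it: pad the source with three trailing newlines (init asserts this so utf8 decoding never reads past the end of the buffer), then pull tokens until Eof. A small sketch in the Zig syntax of this era, using only the Tokenizer and Token declarations from the deleted file; the helper name dumpAllTokens is illustrative, not part of the original code:

const std = @import("std");

// Illustrative driver for the tokenizer removed above. Assumes the deleted
// Tokenizer and Token declarations are in scope; dumpAllTokens is a made-up name.
fn dumpAllTokens(padded_source: []const u8) void {
    // init() requires the buffer to end with "\n\n\n".
    var tokenizer = Tokenizer.init(padded_source);
    while (true) {
        const token = tokenizer.next();
        if (token.id == Token.Id.Eof) break;
        // Print the tag name and the byte range the token covers.
        std.debug.warn("{} [{}..{}]\n", @tagName(token.id), token.start, token.end);
    }
}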