Diffstat (limited to 'src/stage1/tokenizer.cpp')
-rw-r--r--    src/stage1/tokenizer.cpp    1846
1 file changed, 867 insertions, 979 deletions
diff --git a/src/stage1/tokenizer.cpp b/src/stage1/tokenizer.cpp
index 623169a313..4550f32e8c 100644
--- a/src/stage1/tokenizer.cpp
+++ b/src/stage1/tokenizer.cpp
@@ -30,18 +30,28 @@
case '7': \
case '8': \
case '9'
+
#define DIGIT \
'0': \
case DIGIT_NON_ZERO
-#define ALPHA \
+#define HEXDIGIT \
'a': \
case 'b': \
case 'c': \
case 'd': \
case 'e': \
case 'f': \
- case 'g': \
+ case 'A': \
+ case 'B': \
+ case 'C': \
+ case 'D': \
+ case 'E': \
+ case 'F': \
+ case DIGIT
+
+#define ALPHA_EXCEPT_HEX_P_O_X \
+ 'g': \
case 'h': \
case 'i': \
case 'j': \
@@ -49,8 +59,6 @@
case 'l': \
case 'm': \
case 'n': \
- case 'o': \
- case 'p': \
case 'q': \
case 'r': \
case 's': \
@@ -58,15 +66,8 @@
case 'u': \
case 'v': \
case 'w': \
- case 'x': \
case 'y': \
case 'z': \
- case 'A': \
- case 'B': \
- case 'C': \
- case 'D': \
- case 'E': \
- case 'F': \
case 'G': \
case 'H': \
case 'I': \
@@ -76,7 +77,6 @@
case 'M': \
case 'N': \
case 'O': \
- case 'P': \
case 'Q': \
case 'R': \
case 'S': \
@@ -88,7 +88,46 @@
case 'Y': \
case 'Z'
-#define SYMBOL_CHAR \
+#define ALPHA_EXCEPT_E_B_O_X \
+ ALPHA_EXCEPT_HEX_P_O_X: \
+ case 'a': \
+ case 'c': \
+ case 'd': \
+ case 'f': \
+ case 'A': \
+ case 'B': \
+ case 'C': \
+ case 'D': \
+ case 'F': \
+ case 'p': \
+ case 'P'
+
+#define ALPHA_EXCEPT_HEX_AND_P \
+ ALPHA_EXCEPT_HEX_P_O_X: \
+ case 'o': \
+ case 'x'
+
+#define ALPHA_EXCEPT_E \
+ ALPHA_EXCEPT_HEX_AND_P: \
+ case 'a': \
+ case 'b': \
+ case 'c': \
+ case 'd': \
+ case 'f': \
+ case 'A': \
+ case 'B': \
+ case 'C': \
+ case 'D': \
+ case 'F': \
+ case 'p': \
+ case 'P'
+
+#define ALPHA \
+ ALPHA_EXCEPT_E: \
+ case 'e': \
+ case 'E'
+
+#define IDENTIFIER_CHAR \
ALPHA: \
case DIGIT: \
case '_'
@@ -157,101 +196,92 @@ static const struct ZigKeyword zig_keywords[] = {
{"while", TokenIdKeywordWhile},
};
-bool is_zig_keyword(Buf *buf) {
+// Returns TokenIdIdentifier if it is not a keyword.
+static TokenId zig_keyword_token(const char *name_ptr, size_t name_len) {
for (size_t i = 0; i < array_length(zig_keywords); i += 1) {
- if (buf_eql_str(buf, zig_keywords[i].text)) {
- return true;
+ if (mem_eql_str(name_ptr, name_len, zig_keywords[i].text)) {
+ return zig_keywords[i].token_id;
}
}
- return false;
-}
-
-static bool is_symbol_char(uint8_t c) {
- switch (c) {
- case SYMBOL_CHAR:
- return true;
- default:
- return false;
- }
+ return TokenIdIdentifier;
}
enum TokenizeState {
- TokenizeStateStart,
- TokenizeStateSymbol,
- TokenizeStateZero, // "0", which might lead to "0x"
- TokenizeStateNumber, // "123", "0x123"
- TokenizeStateNumberNoUnderscore, // "12_", "0x12_" next char must be digit
- TokenizeStateNumberDot,
- TokenizeStateFloatFraction, // "123.456", "0x123.456"
- TokenizeStateFloatFractionNoUnderscore, // "123.45_", "0x123.45_"
- TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
- TokenizeStateFloatExponentNumber, // "123.456e7", "123.456e+7", "123.456e-7"
- TokenizeStateFloatExponentNumberNoUnderscore, // "123.456e7_", "123.456e+7_", "123.456e-7_"
- TokenizeStateString,
- TokenizeStateStringEscape,
- TokenizeStateStringEscapeUnicodeStart,
- TokenizeStateCharLiteral,
- TokenizeStateCharLiteralEnd,
- TokenizeStateCharLiteralUnicode,
- TokenizeStateSawStar,
- TokenizeStateSawStarPercent,
- TokenizeStateSawSlash,
- TokenizeStateSawSlash2,
- TokenizeStateSawSlash3,
- TokenizeStateSawSlashBang,
- TokenizeStateSawBackslash,
- TokenizeStateSawPercent,
- TokenizeStateSawPlus,
- TokenizeStateSawPlusPercent,
- TokenizeStateSawDash,
- TokenizeStateSawMinusPercent,
- TokenizeStateSawAmpersand,
- TokenizeStateSawCaret,
- TokenizeStateSawBar,
- TokenizeStateDocComment,
- TokenizeStateContainerDocComment,
- TokenizeStateLineComment,
- TokenizeStateLineString,
- TokenizeStateLineStringEnd,
- TokenizeStateLineStringContinue,
- TokenizeStateSawEq,
- TokenizeStateSawBang,
- TokenizeStateSawLessThan,
- TokenizeStateSawLessThanLessThan,
- TokenizeStateSawGreaterThan,
- TokenizeStateSawGreaterThanGreaterThan,
- TokenizeStateSawDot,
- TokenizeStateSawDotDot,
- TokenizeStateSawDotStar,
- TokenizeStateSawAtSign,
- TokenizeStateCharCode,
- TokenizeStateError,
+ TokenizeState_start,
+ TokenizeState_identifier,
+ TokenizeState_builtin,
+ TokenizeState_string_literal,
+ TokenizeState_string_literal_backslash,
+ TokenizeState_multiline_string_literal_line,
+ TokenizeState_char_literal,
+ TokenizeState_char_literal_backslash,
+ TokenizeState_char_literal_hex_escape,
+ TokenizeState_char_literal_unicode_escape_saw_u,
+ TokenizeState_char_literal_unicode_escape,
+ TokenizeState_char_literal_unicode,
+ TokenizeState_char_literal_end,
+ TokenizeState_backslash,
+ TokenizeState_equal,
+ TokenizeState_bang,
+ TokenizeState_pipe,
+ TokenizeState_minus,
+ TokenizeState_minus_percent,
+ TokenizeState_asterisk,
+ TokenizeState_asterisk_percent,
+ TokenizeState_slash,
+ TokenizeState_line_comment_start,
+ TokenizeState_line_comment,
+ TokenizeState_doc_comment_start,
+ TokenizeState_doc_comment,
+ TokenizeState_container_doc_comment,
+ TokenizeState_zero,
+ TokenizeState_int_literal_dec,
+ TokenizeState_int_literal_dec_no_underscore,
+ TokenizeState_int_literal_bin,
+ TokenizeState_int_literal_bin_no_underscore,
+ TokenizeState_int_literal_oct,
+ TokenizeState_int_literal_oct_no_underscore,
+ TokenizeState_int_literal_hex,
+ TokenizeState_int_literal_hex_no_underscore,
+ TokenizeState_num_dot_dec,
+ TokenizeState_num_dot_hex,
+ TokenizeState_float_fraction_dec,
+ TokenizeState_float_fraction_dec_no_underscore,
+ TokenizeState_float_fraction_hex,
+ TokenizeState_float_fraction_hex_no_underscore,
+ TokenizeState_float_exponent_unsigned,
+ TokenizeState_float_exponent_num,
+ TokenizeState_float_exponent_num_no_underscore,
+ TokenizeState_ampersand,
+ TokenizeState_caret,
+ TokenizeState_percent,
+ TokenizeState_plus,
+ TokenizeState_plus_percent,
+ TokenizeState_angle_bracket_left,
+ TokenizeState_angle_bracket_angle_bracket_left,
+ TokenizeState_angle_bracket_right,
+ TokenizeState_angle_bracket_angle_bracket_right,
+ TokenizeState_period,
+ TokenizeState_period_2,
+ TokenizeState_period_asterisk,
+ TokenizeState_saw_at_sign,
+ TokenizeState_error,
};
struct Tokenize {
- Buf *buf;
+ Tokenization *out;
size_t pos;
TokenizeState state;
- ZigList<Token> *tokens;
- int line;
- int column;
- Token *cur_tok;
- Tokenization *out;
- uint32_t radix;
- bool is_trailing_underscore;
- size_t char_code_index;
- bool unicode;
- uint32_t char_code;
- size_t remaining_code_units;
+ uint32_t line;
+ uint32_t column;
};
ATTRIBUTE_PRINTF(2, 3)
static void tokenize_error(Tokenize *t, const char *format, ...) {
- t->state = TokenizeStateError;
+ t->state = TokenizeState_error;
- t->out->err_line = t->line;
- t->out->err_column = t->column;
+ t->out->err_byte_offset = t->pos;
va_list ap;
va_start(ap, format);
@@ -259,98 +289,18 @@ static void tokenize_error(Tokenize *t, const char *format, ...) {
va_end(ap);
}
-static void set_token_id(Tokenize *t, Token *token, TokenId id) {
- token->id = id;
-
- if (id == TokenIdIntLiteral) {
- bigint_init_unsigned(&token->data.int_lit.bigint, 0);
- } else if (id == TokenIdFloatLiteral) {
- bigfloat_init_32(&token->data.float_lit.bigfloat, 0.0f);
- token->data.float_lit.overflow = false;
- } else if (id == TokenIdStringLiteral || id == TokenIdMultilineStringLiteral || id == TokenIdSymbol) {
- memset(&token->data.str_lit.str, 0, sizeof(Buf));
- buf_resize(&token->data.str_lit.str, 0);
- }
-}
-
static void begin_token(Tokenize *t, TokenId id) {
- assert(!t->cur_tok);
- t->tokens->add_one();
- Token *token = &t->tokens->last();
- token->start_line = t->line;
- token->start_column = t->column;
- token->start_pos = t->pos;
-
- set_token_id(t, token, id);
-
- t->cur_tok = token;
+ t->out->ids.append(id);
+ TokenLoc tok_loc;
+ tok_loc.offset = (uint32_t) t->pos;
+ tok_loc.line = t->line;
+ tok_loc.column = t->column;
+ t->out->locs.append(tok_loc);
}
static void cancel_token(Tokenize *t) {
- t->tokens->pop();
- t->cur_tok = nullptr;
-}
-
-static void end_float_token(Tokenize *t) {
- uint8_t *ptr_buf = (uint8_t*)buf_ptr(t->buf) + t->cur_tok->start_pos;
- size_t buf_len = t->cur_tok->end_pos - t->cur_tok->start_pos;
- if (bigfloat_init_buf(&t->cur_tok->data.float_lit.bigfloat, ptr_buf, buf_len)) {
- t->cur_tok->data.float_lit.overflow = true;
- }
-}
-
-static void end_token(Tokenize *t) {
- assert(t->cur_tok);
- t->cur_tok->end_pos = t->pos + 1;
-
- if (t->cur_tok->id == TokenIdFloatLiteral) {
- end_float_token(t);
- } else if (t->cur_tok->id == TokenIdSymbol) {
- char *token_mem = buf_ptr(t->buf) + t->cur_tok->start_pos;
- int token_len = (int)(t->cur_tok->end_pos - t->cur_tok->start_pos);
-
- for (size_t i = 0; i < array_length(zig_keywords); i += 1) {
- if (mem_eql_str(token_mem, token_len, zig_keywords[i].text)) {
- t->cur_tok->id = zig_keywords[i].token_id;
- break;
- }
- }
- }
-
- t->cur_tok = nullptr;
-}
-
-static bool is_exponent_signifier(uint8_t c, int radix) {
- if (radix == 16) {
- return c == 'p' || c == 'P';
- } else {
- return c == 'e' || c == 'E';
- }
-}
-
-static uint32_t get_digit_value(uint8_t c) {
- if ('0' <= c && c <= '9') {
- return c - '0';
- }
- if ('A' <= c && c <= 'Z') {
- return c - 'A' + 10;
- }
- if ('a' <= c && c <= 'z') {
- return c - 'a' + 10;
- }
- return UINT32_MAX;
-}
-
-static void handle_string_escape(Tokenize *t, uint8_t c) {
- if (t->cur_tok->id == TokenIdCharLiteral) {
- t->cur_tok->data.char_lit.c = c;
- t->state = TokenizeStateCharLiteralEnd;
- } else if (t->cur_tok->id == TokenIdStringLiteral || t->cur_tok->id == TokenIdSymbol) {
- buf_append_char(&t->cur_tok->data.str_lit.str, c);
- t->state = TokenizeStateString;
- } else {
- zig_unreachable();
- }
+ t->out->ids.pop();
+ t->out->locs.pop();
}
static const char* get_escape_shorthand(uint8_t c) {
@@ -376,7 +326,15 @@ static const char* get_escape_shorthand(uint8_t c) {
}
}
+static void invalid_eof(Tokenize *t) {
+ return tokenize_error(t, "unexpected End-Of-File");
+}
+
static void invalid_char_error(Tokenize *t, uint8_t c) {
+ if (c == 0) {
+ return invalid_eof(t);
+ }
+
if (c == '\r') {
tokenize_error(t, "invalid carriage return, only '\\n' line endings are supported");
return;
@@ -396,1139 +354,1089 @@ static void invalid_char_error(Tokenize *t, uint8_t c) {
tokenize_error(t, "invalid character: '\\x%02x'", c);
}
-void tokenize(Buf *buf, Tokenization *out) {
+void tokenize(const char *source, Tokenization *out) {
Tokenize t = {0};
t.out = out;
- t.tokens = out->tokens = heap::c_allocator.create<ZigList<Token>>();
- t.buf = buf;
- out->line_offsets = heap::c_allocator.create<ZigList<size_t>>();
- out->line_offsets->append(0);
+ size_t remaining_code_units;
+ size_t seen_escape_digits;
- // Skip the UTF-8 BOM if present
- if (buf_starts_with_mem(buf, "\xEF\xBB\xBF", 3)) {
+ // Skip the UTF-8 BOM if present.
+ if (source[0] == (char)0xef &&
+ source[1] == (char)0xbb &&
+ source[2] == (char)0xbf)
+ {
t.pos += 3;
}
- for (; t.pos < buf_len(t.buf); t.pos += 1) {
- uint8_t c = buf_ptr(t.buf)[t.pos];
+ // Invalid token takes up index 0 so that index 0 can mean "none".
+ begin_token(&t, TokenIdCount);
+
+ for (;;) {
+ uint8_t c = source[t.pos];
switch (t.state) {
- case TokenizeStateError:
- break;
- case TokenizeStateStart:
+ case TokenizeState_error:
+ goto eof;
+ case TokenizeState_start:
switch (c) {
+ case 0:
+ goto eof;
case WHITESPACE:
break;
+ case '"':
+ begin_token(&t, TokenIdStringLiteral);
+ t.state = TokenizeState_string_literal;
+ break;
+ case '\'':
+ begin_token(&t, TokenIdCharLiteral);
+ t.state = TokenizeState_char_literal;
+ break;
case ALPHA:
case '_':
- t.state = TokenizeStateSymbol;
- begin_token(&t, TokenIdSymbol);
- buf_append_char(&t.cur_tok->data.str_lit.str, c);
+ t.state = TokenizeState_identifier;
+ begin_token(&t, TokenIdIdentifier);
break;
- case '0':
- t.state = TokenizeStateZero;
- begin_token(&t, TokenIdIntLiteral);
- t.is_trailing_underscore = false;
- t.radix = 10;
- bigint_init_unsigned(&t.cur_tok->data.int_lit.bigint, 0);
+ case '@':
+ begin_token(&t, TokenIdBuiltin);
+ t.state = TokenizeState_saw_at_sign;
break;
- case DIGIT_NON_ZERO:
- t.state = TokenizeStateNumber;
- begin_token(&t, TokenIdIntLiteral);
- t.is_trailing_underscore = false;
- t.radix = 10;
- bigint_init_unsigned(&t.cur_tok->data.int_lit.bigint, get_digit_value(c));
+ case '=':
+ begin_token(&t, TokenIdEq);
+ t.state = TokenizeState_equal;
break;
- case '"':
- begin_token(&t, TokenIdStringLiteral);
- t.state = TokenizeStateString;
+ case '!':
+ begin_token(&t, TokenIdBang);
+ t.state = TokenizeState_bang;
break;
- case '\'':
- begin_token(&t, TokenIdCharLiteral);
- t.state = TokenizeStateCharLiteral;
+ case '|':
+ begin_token(&t, TokenIdBinOr);
+ t.state = TokenizeState_pipe;
break;
case '(':
begin_token(&t, TokenIdLParen);
- end_token(&t);
break;
case ')':
begin_token(&t, TokenIdRParen);
- end_token(&t);
- break;
- case ',':
- begin_token(&t, TokenIdComma);
- end_token(&t);
- break;
- case '?':
- begin_token(&t, TokenIdQuestion);
- end_token(&t);
- break;
- case '{':
- begin_token(&t, TokenIdLBrace);
- end_token(&t);
- break;
- case '}':
- begin_token(&t, TokenIdRBrace);
- end_token(&t);
break;
case '[':
begin_token(&t, TokenIdLBracket);
- end_token(&t);
break;
case ']':
begin_token(&t, TokenIdRBracket);
- end_token(&t);
break;
case ';':
begin_token(&t, TokenIdSemicolon);
- end_token(&t);
+ break;
+ case ',':
+ begin_token(&t, TokenIdComma);
+ break;
+ case '?':
+ begin_token(&t, TokenIdQuestion);
break;
case ':':
begin_token(&t, TokenIdColon);
- end_token(&t);
break;
- case '#':
- begin_token(&t, TokenIdNumberSign);
- end_token(&t);
+ case '%':
+ begin_token(&t, TokenIdPercent);
+ t.state = TokenizeState_percent;
break;
case '*':
begin_token(&t, TokenIdStar);
- t.state = TokenizeStateSawStar;
+ t.state = TokenizeState_asterisk;
break;
- case '/':
- begin_token(&t, TokenIdSlash);
- t.state = TokenizeStateSawSlash;
+ case '+':
+ begin_token(&t, TokenIdPlus);
+ t.state = TokenizeState_plus;
+ break;
+ case '<':
+ begin_token(&t, TokenIdCmpLessThan);
+ t.state = TokenizeState_angle_bracket_left;
+ break;
+ case '>':
+ begin_token(&t, TokenIdCmpGreaterThan);
+ t.state = TokenizeState_angle_bracket_right;
+ break;
+ case '^':
+ begin_token(&t, TokenIdBinXor);
+ t.state = TokenizeState_caret;
break;
case '\\':
- begin_token(&t, TokenIdMultilineStringLiteral);
- t.state = TokenizeStateSawBackslash;
+ begin_token(&t, TokenIdMultilineStringLiteralLine);
+ t.state = TokenizeState_backslash;
break;
- case '%':
- begin_token(&t, TokenIdPercent);
- t.state = TokenizeStateSawPercent;
+ case '{':
+ begin_token(&t, TokenIdLBrace);
break;
- case '+':
- begin_token(&t, TokenIdPlus);
- t.state = TokenizeStateSawPlus;
+ case '}':
+ begin_token(&t, TokenIdRBrace);
break;
case '~':
begin_token(&t, TokenIdTilde);
- end_token(&t);
break;
- case '@':
- begin_token(&t, TokenIdAtSign);
- t.state = TokenizeStateSawAtSign;
+ case '.':
+ begin_token(&t, TokenIdDot);
+ t.state = TokenizeState_period;
break;
case '-':
begin_token(&t, TokenIdDash);
- t.state = TokenizeStateSawDash;
+ t.state = TokenizeState_minus;
+ break;
+ case '/':
+ begin_token(&t, TokenIdSlash);
+ t.state = TokenizeState_slash;
break;
case '&':
begin_token(&t, TokenIdAmpersand);
- t.state = TokenizeStateSawAmpersand;
+ t.state = TokenizeState_ampersand;
break;
- case '^':
- begin_token(&t, TokenIdBinXor);
- t.state = TokenizeStateSawCaret;
- break;
- case '|':
- begin_token(&t, TokenIdBinOr);
- t.state = TokenizeStateSawBar;
+ case '0':
+ t.state = TokenizeState_zero;
+ begin_token(&t, TokenIdIntLiteral);
break;
- case '=':
- begin_token(&t, TokenIdEq);
- t.state = TokenizeStateSawEq;
+ case DIGIT_NON_ZERO:
+ t.state = TokenizeState_int_literal_dec;
+ begin_token(&t, TokenIdIntLiteral);
break;
- case '!':
- begin_token(&t, TokenIdBang);
- t.state = TokenizeStateSawBang;
+ default:
+ invalid_char_error(&t, c);
+ }
+ break;
+ case TokenizeState_saw_at_sign:
+ switch (c) {
+ case 0:
+ invalid_eof(&t);
+ goto eof;
+ case '"':
+ t.out->ids.last() = TokenIdIdentifier;
+ t.state = TokenizeState_string_literal;
break;
- case '<':
- begin_token(&t, TokenIdCmpLessThan);
- t.state = TokenizeStateSawLessThan;
+ case IDENTIFIER_CHAR:
+ t.state = TokenizeState_builtin;
break;
- case '>':
- begin_token(&t, TokenIdCmpGreaterThan);
- t.state = TokenizeStateSawGreaterThan;
+ default:
+ invalid_char_error(&t, c);
+ }
+ break;
+ case TokenizeState_ampersand:
+ switch (c) {
+ case 0:
+ goto eof;
+ case '&':
+ tokenize_error(&t, "`&&` is invalid. Note that `and` is boolean AND");
break;
- case '.':
- begin_token(&t, TokenIdDot);
- t.state = TokenizeStateSawDot;
+ case '=':
+ t.out->ids.last() = TokenIdBitAndEq;
+ t.state = TokenizeState_start;
break;
default:
- invalid_char_error(&t, c);
+ t.state = TokenizeState_start;
+ continue;
}
break;
- case TokenizeStateSawDot:
+ case TokenizeState_asterisk:
switch (c) {
- case '.':
- t.state = TokenizeStateSawDotDot;
- set_token_id(&t, t.cur_tok, TokenIdEllipsis2);
+ case 0:
+ goto eof;
+ case '=':
+ t.out->ids.last() = TokenIdTimesEq;
+ t.state = TokenizeState_start;
break;
case '*':
- t.state = TokenizeStateSawDotStar;
- set_token_id(&t, t.cur_tok, TokenIdDotStar);
+ t.out->ids.last() = TokenIdStarStar;
+ t.state = TokenizeState_start;
+ break;
+ case '%':
+ t.state = TokenizeState_asterisk_percent;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawDotDot:
+ case TokenizeState_asterisk_percent:
switch (c) {
- case '.':
- t.state = TokenizeStateStart;
- set_token_id(&t, t.cur_tok, TokenIdEllipsis3);
- end_token(&t);
+ case 0:
+ t.out->ids.last() = TokenIdTimesPercent;
+ goto eof;
+ case '=':
+ t.out->ids.last() = TokenIdTimesPercentEq;
+ t.state = TokenizeState_start;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdTimesPercent;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawDotStar:
+ case TokenizeState_percent:
switch (c) {
- case '*':
- tokenize_error(&t, "`.*` can't be followed by `*`. Are you missing a space?");
+ case 0:
+ goto eof;
+ case '=':
+ t.out->ids.last() = TokenIdModEq;
+ t.state = TokenizeState_start;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawGreaterThan:
+ case TokenizeState_plus:
switch (c) {
+ case 0:
+ goto eof;
case '=':
- set_token_id(&t, t.cur_tok, TokenIdCmpGreaterOrEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdPlusEq;
+ t.state = TokenizeState_start;
break;
- case '>':
- set_token_id(&t, t.cur_tok, TokenIdBitShiftRight);
- t.state = TokenizeStateSawGreaterThanGreaterThan;
+ case '+':
+ t.out->ids.last() = TokenIdPlusPlus;
+ t.state = TokenizeState_start;
+ break;
+ case '%':
+ t.state = TokenizeState_plus_percent;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawGreaterThanGreaterThan:
+ case TokenizeState_plus_percent:
switch (c) {
+ case 0:
+ t.out->ids.last() = TokenIdPlusPercent;
+ goto eof;
case '=':
- set_token_id(&t, t.cur_tok, TokenIdBitShiftRightEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdPlusPercentEq;
+ t.state = TokenizeState_start;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdPlusPercent;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawLessThan:
+ case TokenizeState_caret:
switch (c) {
+ case 0:
+ goto eof;
case '=':
- set_token_id(&t, t.cur_tok, TokenIdCmpLessOrEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdBitXorEq;
+ t.state = TokenizeState_start;
break;
- case '<':
- set_token_id(&t, t.cur_tok, TokenIdBitShiftLeft);
- t.state = TokenizeStateSawLessThanLessThan;
+ default:
+ t.state = TokenizeState_start;
+ continue;
+ }
+ break;
+ case TokenizeState_identifier:
+ switch (c) {
+ case 0: {
+ uint32_t start_pos = t.out->locs.last().offset;
+ t.out->ids.last() = zig_keyword_token(
+ source + start_pos, t.pos - start_pos);
+ goto eof;
+ }
+ case IDENTIFIER_CHAR:
+ break;
+ default: {
+ uint32_t start_pos = t.out->locs.last().offset;
+ t.out->ids.last() = zig_keyword_token(
+ source + start_pos, t.pos - start_pos);
+
+ t.state = TokenizeState_start;
+ continue;
+ }
+ }
+ break;
+ case TokenizeState_builtin:
+ switch (c) {
+ case 0:
+ goto eof;
+ case IDENTIFIER_CHAR:
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawLessThanLessThan:
+ case TokenizeState_backslash:
+ switch (c) {
+ case '\\':
+ t.state = TokenizeState_multiline_string_literal_line;
+ break;
+ default:
+ invalid_char_error(&t, c);
+ break;
+ }
+ break;
+ case TokenizeState_string_literal:
+ switch (c) {
+ case 0:
+ invalid_eof(&t);
+ goto eof;
+ case '\\':
+ t.state = TokenizeState_string_literal_backslash;
+ break;
+ case '"':
+ t.state = TokenizeState_start;
+ break;
+ case '\n':
+ case '\r':
+ tokenize_error(&t, "newline not allowed in string literal");
+ break;
+ default:
+ break;
+ }
+ break;
+ case TokenizeState_string_literal_backslash:
+ switch (c) {
+ case 0:
+ invalid_eof(&t);
+ goto eof;
+ case '\n':
+ case '\r':
+ tokenize_error(&t, "newline not allowed in string literal");
+ break;
+ default:
+ t.state = TokenizeState_string_literal;
+ break;
+ }
+ break;
+ case TokenizeState_char_literal:
+ if (c == 0) {
+ invalid_eof(&t);
+ goto eof;
+ } else if (c == '\\') {
+ t.state = TokenizeState_char_literal_backslash;
+ } else if (c == '\'') {
+ tokenize_error(&t, "expected character");
+ } else if ((c >= 0x80 && c <= 0xbf) || c >= 0xf8) {
+ // 10xxxxxx
+ // 11111xxx
+ invalid_char_error(&t, c);
+ } else if (c >= 0xc0 && c <= 0xdf) {
+ // 110xxxxx
+ remaining_code_units = 1;
+ t.state = TokenizeState_char_literal_unicode;
+ } else if (c >= 0xe0 && c <= 0xef) {
+ // 1110xxxx
+ remaining_code_units = 2;
+ t.state = TokenizeState_char_literal_unicode;
+ } else if (c >= 0xf0 && c <= 0xf7) {
+ // 11110xxx
+ remaining_code_units = 3;
+ t.state = TokenizeState_char_literal_unicode;
+ } else {
+ t.state = TokenizeState_char_literal_end;
+ }
+ break;
+ case TokenizeState_char_literal_backslash:
+ switch (c) {
+ case 0:
+ invalid_eof(&t);
+ goto eof;
+ case '\n':
+ case '\r':
+ tokenize_error(&t, "newline not allowed in character literal");
+ break;
+ case 'x':
+ t.state = TokenizeState_char_literal_hex_escape;
+ seen_escape_digits = 0;
+ break;
+ case 'u':
+ t.state = TokenizeState_char_literal_unicode_escape_saw_u;
+ break;
+ case 'U':
+ invalid_char_error(&t, c);
+ break;
+ default:
+ t.state = TokenizeState_char_literal_end;
+ break;
+ }
+ break;
+ case TokenizeState_char_literal_hex_escape:
+ switch (c) {
+ case ALPHA:
+ case DIGIT:
+ seen_escape_digits += 1;
+ if (seen_escape_digits == 2) {
+ t.state = TokenizeState_char_literal_end;
+ }
+ break;
+ default:
+ tokenize_error(&t, "expected hex digit");
+ break;
+ }
+ break;
+ case TokenizeState_char_literal_unicode_escape_saw_u:
+ switch (c) {
+ case '{':
+ t.state = TokenizeState_char_literal_unicode_escape;
+ seen_escape_digits = 0;
+ break;
+ default:
+ tokenize_error(&t, "expected '{' to begin unicode escape sequence");
+ break;
+ }
+ break;
+ case TokenizeState_char_literal_unicode_escape:
switch (c) {
+ case ALPHA:
+ case DIGIT:
+ seen_escape_digits += 1;
+ break;
+ case '}':
+ if (seen_escape_digits == 0) {
+ tokenize_error(&t, "empty unicode escape sequence");
+ break;
+ }
+ t.state = TokenizeState_char_literal_end;
+ break;
+ default:
+ tokenize_error(&t, "expected hex digit");
+ break;
+ }
+ break;
+ case TokenizeState_char_literal_end:
+ switch (c) {
+ case '\'':
+ t.state = TokenizeState_start;
+ break;
+ default:
+ invalid_char_error(&t, c);
+ break;
+ }
+ break;
+ case TokenizeState_char_literal_unicode:
+ if (c >= 0x80 && c <= 0xbf) {
+ remaining_code_units -= 1;
+ if (remaining_code_units == 0) {
+ t.state = TokenizeState_char_literal_end;
+ }
+ } else {
+ invalid_char_error(&t, c);
+ }
+ break;
+ case TokenizeState_multiline_string_literal_line:
+ switch (c) {
+ case 0:
+ goto eof;
+ case '\n':
+ t.state = TokenizeState_start;
+ break;
+ default:
+ break;
+ }
+ break;
+ case TokenizeState_bang:
+ switch (c) {
+ case 0:
+ goto eof;
case '=':
- set_token_id(&t, t.cur_tok, TokenIdBitShiftLeftEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdCmpNotEq;
+ t.state = TokenizeState_start;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawBang:
+ case TokenizeState_pipe:
switch (c) {
+ case 0:
+ goto eof;
case '=':
- set_token_id(&t, t.cur_tok, TokenIdCmpNotEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdBitOrEq;
+ t.state = TokenizeState_start;
+ break;
+ case '|':
+ t.out->ids.last() = TokenIdBarBar;
+ t.state = TokenizeState_start;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawEq:
+ case TokenizeState_equal:
switch (c) {
+ case 0:
+ goto eof;
case '=':
- set_token_id(&t, t.cur_tok, TokenIdCmpEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdCmpEq;
+ t.state = TokenizeState_start;
break;
case '>':
- set_token_id(&t, t.cur_tok, TokenIdFatArrow);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdFatArrow;
+ t.state = TokenizeState_start;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawStar:
+ case TokenizeState_minus:
switch (c) {
- case '=':
- set_token_id(&t, t.cur_tok, TokenIdTimesEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ case 0:
+ goto eof;
+ case '>':
+ t.out->ids.last() = TokenIdArrow;
+ t.state = TokenizeState_start;
break;
- case '*':
- set_token_id(&t, t.cur_tok, TokenIdStarStar);
- end_token(&t);
- t.state = TokenizeStateStart;
+ case '=':
+ t.out->ids.last() = TokenIdMinusEq;
+ t.state = TokenizeState_start;
break;
case '%':
- set_token_id(&t, t.cur_tok, TokenIdTimesPercent);
- t.state = TokenizeStateSawStarPercent;
+ t.state = TokenizeState_minus_percent;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawStarPercent:
+ case TokenizeState_minus_percent:
switch (c) {
+ case 0:
+ t.out->ids.last() = TokenIdMinusPercent;
+ goto eof;
case '=':
- set_token_id(&t, t.cur_tok, TokenIdTimesPercentEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdMinusPercentEq;
+ t.state = TokenizeState_start;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdMinusPercent;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawPercent:
+ case TokenizeState_angle_bracket_left:
switch (c) {
+ case 0:
+ goto eof;
case '=':
- set_token_id(&t, t.cur_tok, TokenIdModEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdCmpLessOrEq;
+ t.state = TokenizeState_start;
break;
- case '.':
- set_token_id(&t, t.cur_tok, TokenIdPercentDot);
- end_token(&t);
- t.state = TokenizeStateStart;
+ case '<':
+ t.state = TokenizeState_angle_bracket_angle_bracket_left;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawPlus:
+ case TokenizeState_angle_bracket_angle_bracket_left:
switch (c) {
+ case 0:
+ t.out->ids.last() = TokenIdBitShiftLeft;
+ goto eof;
case '=':
- set_token_id(&t, t.cur_tok, TokenIdPlusEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdBitShiftLeftEq;
+ t.state = TokenizeState_start;
break;
- case '+':
- set_token_id(&t, t.cur_tok, TokenIdPlusPlus);
- end_token(&t);
- t.state = TokenizeStateStart;
+ default:
+ t.out->ids.last() = TokenIdBitShiftLeft;
+ t.state = TokenizeState_start;
+ continue;
+ }
+ break;
+ case TokenizeState_angle_bracket_right:
+ switch (c) {
+ case 0:
+ goto eof;
+ case '=':
+ t.out->ids.last() = TokenIdCmpGreaterOrEq;
+ t.state = TokenizeState_start;
break;
- case '%':
- set_token_id(&t, t.cur_tok, TokenIdPlusPercent);
- t.state = TokenizeStateSawPlusPercent;
+ case '>':
+ t.state = TokenizeState_angle_bracket_angle_bracket_right;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawPlusPercent:
+ case TokenizeState_angle_bracket_angle_bracket_right:
switch (c) {
+ case 0:
+ t.out->ids.last() = TokenIdBitShiftRight;
+ goto eof;
case '=':
- set_token_id(&t, t.cur_tok, TokenIdPlusPercentEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdBitShiftRightEq;
+ t.state = TokenizeState_start;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdBitShiftRight;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawAmpersand:
+ case TokenizeState_period:
switch (c) {
- case '&':
- tokenize_error(&t, "`&&` is invalid. Note that `and` is boolean AND");
+ case 0:
+ goto eof;
+ case '.':
+ t.state = TokenizeState_period_2;
break;
- case '=':
- set_token_id(&t, t.cur_tok, TokenIdBitAndEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ case '*':
+ t.state = TokenizeState_period_asterisk;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawCaret:
+ case TokenizeState_period_2:
switch (c) {
- case '=':
- set_token_id(&t, t.cur_tok, TokenIdBitXorEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ case 0:
+ t.out->ids.last() = TokenIdEllipsis2;
+ goto eof;
+ case '.':
+ t.out->ids.last() = TokenIdEllipsis3;
+ t.state = TokenizeState_start;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdEllipsis2;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawBar:
+ case TokenizeState_period_asterisk:
switch (c) {
- case '=':
- set_token_id(&t, t.cur_tok, TokenIdBitOrEq);
- end_token(&t);
- t.state = TokenizeStateStart;
- break;
- case '|':
- set_token_id(&t, t.cur_tok, TokenIdBarBar);
- end_token(&t);
- t.state = TokenizeStateStart;
+ case 0:
+ t.out->ids.last() = TokenIdDotStar;
+ goto eof;
+ case '*':
+ tokenize_error(&t, "`.*` cannot be followed by `*`. Are you missing a space?");
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdDotStar;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawSlash:
+ case TokenizeState_slash:
switch (c) {
+ case 0:
+ goto eof;
case '/':
- t.state = TokenizeStateSawSlash2;
+ t.state = TokenizeState_line_comment_start;
break;
case '=':
- set_token_id(&t, t.cur_tok, TokenIdDivEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdDivEq;
+ t.state = TokenizeState_start;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSawSlash2:
+ case TokenizeState_line_comment_start:
switch (c) {
+ case 0:
+ goto eof;
case '/':
- t.state = TokenizeStateSawSlash3;
+ t.state = TokenizeState_doc_comment_start;
break;
case '!':
- t.state = TokenizeStateSawSlashBang;
+ t.out->ids.last() = TokenIdContainerDocComment;
+ t.state = TokenizeState_container_doc_comment;
break;
case '\n':
cancel_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
break;
default:
cancel_token(&t);
- t.state = TokenizeStateLineComment;
+ t.state = TokenizeState_line_comment;
break;
}
break;
- case TokenizeStateSawSlash3:
+ case TokenizeState_doc_comment_start:
switch (c) {
+ case 0:
+ t.out->ids.last() = TokenIdDocComment;
+ goto eof;
case '/':
cancel_token(&t);
- t.state = TokenizeStateLineComment;
+ t.state = TokenizeState_line_comment;
break;
case '\n':
- set_token_id(&t, t.cur_tok, TokenIdDocComment);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.out->ids.last() = TokenIdDocComment;
+ t.state = TokenizeState_start;
break;
default:
- set_token_id(&t, t.cur_tok, TokenIdDocComment);
- t.state = TokenizeStateDocComment;
+ t.out->ids.last() = TokenIdDocComment;
+ t.state = TokenizeState_doc_comment;
break;
}
break;
- case TokenizeStateSawSlashBang:
+ case TokenizeState_line_comment:
switch (c) {
+ case 0:
+ goto eof;
case '\n':
- set_token_id(&t, t.cur_tok, TokenIdContainerDocComment);
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
break;
default:
- set_token_id(&t, t.cur_tok, TokenIdContainerDocComment);
- t.state = TokenizeStateContainerDocComment;
break;
}
break;
- case TokenizeStateSawBackslash:
+ case TokenizeState_doc_comment:
+ case TokenizeState_container_doc_comment:
switch (c) {
- case '\\':
- t.state = TokenizeStateLineString;
+ case 0:
+ goto eof;
+ case '\n':
+ t.state = TokenizeState_start;
break;
default:
- invalid_char_error(&t, c);
+ // do nothing
break;
}
break;
- case TokenizeStateLineString:
+ case TokenizeState_zero:
switch (c) {
- case '\n':
- t.state = TokenizeStateLineStringEnd;
+ case 0:
+ goto eof;
+ case 'b':
+ t.state = TokenizeState_int_literal_bin_no_underscore;
break;
- default:
- buf_append_char(&t.cur_tok->data.str_lit.str, c);
+ case 'o':
+ t.state = TokenizeState_int_literal_oct_no_underscore;
break;
- }
- break;
- case TokenizeStateLineStringEnd:
- switch (c) {
- case WHITESPACE:
+ case 'x':
+ t.state = TokenizeState_int_literal_hex_no_underscore;
break;
- case '\\':
- t.state = TokenizeStateLineStringContinue;
+ case DIGIT:
+ case '_':
+ case '.':
+ case 'e':
+ case 'E':
+ // Reinterpret as a decimal number.
+ t.state = TokenizeState_int_literal_dec;
+ continue;
+ case ALPHA_EXCEPT_E_B_O_X:
+ invalid_char_error(&t, c);
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateLineStringContinue:
+ case TokenizeState_int_literal_bin_no_underscore:
switch (c) {
- case '\\':
- t.state = TokenizeStateLineString;
- buf_append_char(&t.cur_tok->data.str_lit.str, '\n');
+ case '0':
+ case '1':
+ t.state = TokenizeState_int_literal_bin;
break;
default:
invalid_char_error(&t, c);
- break;
}
break;
- case TokenizeStateLineComment:
+ case TokenizeState_int_literal_bin:
switch (c) {
- case '\n':
- t.state = TokenizeStateStart;
+ case 0:
+ goto eof;
+ case '_':
+ t.state = TokenizeState_int_literal_bin_no_underscore;
break;
- default:
- // do nothing
+ case '0':
+ case '1':
+ break;
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ case ALPHA:
+ invalid_char_error(&t, c);
break;
+ default:
+ t.state = TokenizeState_start;
+ continue;
}
break;
- case TokenizeStateDocComment:
+ case TokenizeState_int_literal_oct_no_underscore:
switch (c) {
- case '\n':
- end_token(&t);
- t.state = TokenizeStateStart;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ t.state = TokenizeState_int_literal_oct;
break;
default:
- // do nothing
+ invalid_char_error(&t, c);
break;
}
break;
- case TokenizeStateContainerDocComment:
+ case TokenizeState_int_literal_oct:
switch (c) {
- case '\n':
- end_token(&t);
- t.state = TokenizeStateStart;
+ case 0:
+ goto eof;
+ case '_':
+ t.state = TokenizeState_int_literal_oct_no_underscore;
break;
- default:
- // do nothing
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
break;
- }
- break;
- case TokenizeStateSawAtSign:
- switch (c) {
- case '"':
- set_token_id(&t, t.cur_tok, TokenIdSymbol);
- t.state = TokenizeStateString;
+ case ALPHA:
+ case '8':
+ case '9':
+ invalid_char_error(&t, c);
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateSymbol:
+ case TokenizeState_int_literal_dec_no_underscore:
switch (c) {
- case SYMBOL_CHAR:
- buf_append_char(&t.cur_tok->data.str_lit.str, c);
+ case DIGIT:
+ t.state = TokenizeState_int_literal_dec;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
- continue;
+ invalid_char_error(&t, c);
+ break;
}
break;
- case TokenizeStateString:
+ case TokenizeState_int_literal_dec:
switch (c) {
- case '"':
- end_token(&t);
- t.state = TokenizeStateStart;
+ case 0:
+ goto eof;
+ case '_':
+ t.state = TokenizeState_int_literal_dec_no_underscore;
break;
- case '\n':
- tokenize_error(&t, "newline not allowed in string literal");
+ case '.':
+ t.state = TokenizeState_num_dot_dec;
+ t.out->ids.last() = TokenIdFloatLiteral;
break;
- case '\\':
- t.state = TokenizeStateStringEscape;
+ case 'e':
+ case 'E':
+ t.state = TokenizeState_float_exponent_unsigned;
+ t.out->ids.last() = TokenIdFloatLiteral;
break;
- default:
- buf_append_char(&t.cur_tok->data.str_lit.str, c);
+ case DIGIT:
break;
+ case ALPHA_EXCEPT_E:
+ invalid_char_error(&t, c);
+ break;
+ default:
+ t.state = TokenizeState_start;
+ continue;
}
break;
- case TokenizeStateStringEscape:
+ case TokenizeState_int_literal_hex_no_underscore:
switch (c) {
- case 'x':
- t.state = TokenizeStateCharCode;
- t.radix = 16;
- t.char_code = 0;
- t.char_code_index = 0;
- t.unicode = false;
- break;
- case 'u':
- t.state = TokenizeStateStringEscapeUnicodeStart;
+ case HEXDIGIT:
+ t.state = TokenizeState_int_literal_hex;
break;
- case 'n':
- handle_string_escape(&t, '\n');
- break;
- case 'r':
- handle_string_escape(&t, '\r');
+ default:
+ invalid_char_error(&t, c);
+ }
+ break;
+ case TokenizeState_int_literal_hex:
+ switch (c) {
+ case 0:
+ goto eof;
+ case '_':
+ t.state = TokenizeState_int_literal_hex_no_underscore;
break;
- case '\\':
- handle_string_escape(&t, '\\');
+ case '.':
+ t.state = TokenizeState_num_dot_hex;
+ t.out->ids.last() = TokenIdFloatLiteral;
break;
- case 't':
- handle_string_escape(&t, '\t');
+ case 'p':
+ case 'P':
+ t.state = TokenizeState_float_exponent_unsigned;
+ t.out->ids.last() = TokenIdFloatLiteral;
break;
- case '\'':
- handle_string_escape(&t, '\'');
+ case HEXDIGIT:
break;
- case '"':
- handle_string_escape(&t, '\"');
+ case ALPHA_EXCEPT_HEX_AND_P:
+ invalid_char_error(&t, c);
break;
default:
- invalid_char_error(&t, c);
+ t.state = TokenizeState_start;
+ continue;
}
break;
- case TokenizeStateStringEscapeUnicodeStart:
+ case TokenizeState_num_dot_dec:
switch (c) {
- case '{':
- t.state = TokenizeStateCharCode;
- t.radix = 16;
- t.char_code = 0;
- t.char_code_index = 0;
- t.unicode = true;
+ case 0:
+ goto eof;
+ case '.':
+ t.out->ids.last() = TokenIdIntLiteral;
+ t.pos -= 1;
+ t.column -= 1;
+ t.state = TokenizeState_start;
+ continue;
+ case 'e':
+ case 'E':
+ t.state = TokenizeState_float_exponent_unsigned;
+ break;
+ case DIGIT:
+ t.state = TokenizeState_float_fraction_dec;
break;
default:
invalid_char_error(&t, c);
+ break;
}
break;
- case TokenizeStateCharCode:
- {
- if (t.unicode && c == '}') {
- if (t.char_code_index == 0) {
- tokenize_error(&t, "empty unicode escape sequence");
- break;
- }
- if (t.char_code > 0x10ffff) {
- tokenize_error(&t, "unicode value out of range: %x", t.char_code);
- break;
- }
- if (t.cur_tok->id == TokenIdCharLiteral) {
- t.cur_tok->data.char_lit.c = t.char_code;
- t.state = TokenizeStateCharLiteralEnd;
- } else if (t.char_code <= 0x7f) {
- // 00000000 00000000 00000000 0xxxxxxx
- handle_string_escape(&t, (uint8_t)t.char_code);
- } else if (t.char_code <= 0x7ff) {
- // 00000000 00000000 00000xxx xx000000
- handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
- // 00000000 00000000 00000000 00xxxxxx
- handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
- } else if (t.char_code <= 0xffff) {
- // 00000000 00000000 xxxx0000 00000000
- handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
- // 00000000 00000000 0000xxxx xx000000
- handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
- // 00000000 00000000 00000000 00xxxxxx
- handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
- } else if (t.char_code <= 0x10ffff) {
- // 00000000 000xxx00 00000000 00000000
- handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
- // 00000000 000000xx xxxx0000 00000000
- handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
- // 00000000 00000000 0000xxxx xx000000
- handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
- // 00000000 00000000 00000000 00xxxxxx
- handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
- } else {
- zig_unreachable();
- }
+ case TokenizeState_num_dot_hex:
+ switch (c) {
+ case 0:
+ goto eof;
+ case '.':
+ t.out->ids.last() = TokenIdIntLiteral;
+ t.pos -= 1;
+ t.column -= 1;
+ t.state = TokenizeState_start;
+ continue;
+ case 'p':
+ case 'P':
+ t.state = TokenizeState_float_exponent_unsigned;
break;
- }
-
- uint32_t digit_value = get_digit_value(c);
- if (digit_value >= t.radix) {
- tokenize_error(&t, "invalid digit: '%c'", c);
+ case HEXDIGIT:
+ t.out->ids.last() = TokenIdFloatLiteral;
+ t.state = TokenizeState_float_fraction_hex;
+ break;
+ default:
+ invalid_char_error(&t, c);
break;
- }
- t.char_code *= t.radix;
- t.char_code += digit_value;
- t.char_code_index += 1;
-
- if (!t.unicode && t.char_code_index >= 2) {
- assert(t.char_code <= 255);
- handle_string_escape(&t, (uint8_t)t.char_code);
- }
- }
- break;
- case TokenizeStateCharLiteral:
- if (c == '\'') {
- tokenize_error(&t, "expected character");
- } else if (c == '\\') {
- t.state = TokenizeStateStringEscape;
- } else if ((c >= 0x80 && c <= 0xbf) || c >= 0xf8) {
- // 10xxxxxx
- // 11111xxx
- invalid_char_error(&t, c);
- } else if (c >= 0xc0 && c <= 0xdf) {
- // 110xxxxx
- t.cur_tok->data.char_lit.c = c & 0x1f;
- t.remaining_code_units = 1;
- t.state = TokenizeStateCharLiteralUnicode;
- } else if (c >= 0xe0 && c <= 0xef) {
- // 1110xxxx
- t.cur_tok->data.char_lit.c = c & 0x0f;
- t.remaining_code_units = 2;
- t.state = TokenizeStateCharLiteralUnicode;
- } else if (c >= 0xf0 && c <= 0xf7) {
- // 11110xxx
- t.cur_tok->data.char_lit.c = c & 0x07;
- t.remaining_code_units = 3;
- t.state = TokenizeStateCharLiteralUnicode;
- } else {
- t.cur_tok->data.char_lit.c = c;
- t.state = TokenizeStateCharLiteralEnd;
}
break;
- case TokenizeStateCharLiteralEnd:
+ case TokenizeState_float_fraction_dec_no_underscore:
switch (c) {
- case '\'':
- end_token(&t);
- t.state = TokenizeStateStart;
+ case DIGIT:
+ t.state = TokenizeState_float_fraction_dec;
break;
default:
invalid_char_error(&t, c);
}
break;
- case TokenizeStateCharLiteralUnicode:
- if (c <= 0x7f || c >= 0xc0) {
- invalid_char_error(&t, c);
- }
- t.cur_tok->data.char_lit.c <<= 6;
- t.cur_tok->data.char_lit.c += c & 0x3f;
- t.remaining_code_units--;
- if (t.remaining_code_units == 0) {
- t.state = TokenizeStateCharLiteralEnd;
- }
- break;
- case TokenizeStateZero:
+ case TokenizeState_float_fraction_dec:
switch (c) {
- case 'b':
- t.radix = 2;
- t.state = TokenizeStateNumberNoUnderscore;
+ case 0:
+ goto eof;
+ case '_':
+ t.state = TokenizeState_float_fraction_dec_no_underscore;
break;
- case 'o':
- t.radix = 8;
- t.state = TokenizeStateNumberNoUnderscore;
+ case 'e':
+ case 'E':
+ t.state = TokenizeState_float_exponent_unsigned;
break;
- case 'x':
- t.radix = 16;
- t.state = TokenizeStateNumberNoUnderscore;
+ case DIGIT:
+ break;
+ case ALPHA_EXCEPT_E:
+ invalid_char_error(&t, c);
break;
default:
- // reinterpret as normal number
- t.pos -= 1;
- t.state = TokenizeStateNumber;
+ t.state = TokenizeState_start;
continue;
}
break;
- case TokenizeStateNumberNoUnderscore:
- if (c == '_') {
- invalid_char_error(&t, c);
- break;
- } else if (get_digit_value(c) < t.radix) {
- t.is_trailing_underscore = false;
- t.state = TokenizeStateNumber;
+ case TokenizeState_float_fraction_hex_no_underscore:
+ switch (c) {
+ case HEXDIGIT:
+ t.state = TokenizeState_float_fraction_hex;
+ break;
+ default:
+ invalid_char_error(&t, c);
}
- ZIG_FALLTHROUGH;
- case TokenizeStateNumber:
- {
- if (c == '_') {
- t.is_trailing_underscore = true;
- t.state = TokenizeStateNumberNoUnderscore;
+ break;
+ case TokenizeState_float_fraction_hex:
+ switch (c) {
+ case 0:
+ goto eof;
+ case '_':
+ t.state = TokenizeState_float_fraction_hex_no_underscore;
break;
- }
- if (c == '.') {
- if (t.is_trailing_underscore) {
- invalid_char_error(&t, c);
- break;
- }
- t.state = TokenizeStateNumberDot;
+ case 'p':
+ case 'P':
+ t.state = TokenizeState_float_exponent_unsigned;
break;
- }
- if (is_exponent_signifier(c, t.radix)) {
- if (t.is_trailing_underscore) {
- invalid_char_error(&t, c);
- break;
- }
- if (t.radix != 16 && t.radix != 10) {
- invalid_char_error(&t, c);
- }
- t.state = TokenizeStateFloatExponentUnsigned;
- t.radix = 10; // exponent is always base 10
- assert(t.cur_tok->id == TokenIdIntLiteral);
- set_token_id(&t, t.cur_tok, TokenIdFloatLiteral);
+ case HEXDIGIT:
break;
- }
- uint32_t digit_value = get_digit_value(c);
- if (digit_value >= t.radix) {
- if (t.is_trailing_underscore) {
- invalid_char_error(&t, c);
- break;
- }
-
- if (is_symbol_char(c)) {
- invalid_char_error(&t, c);
- }
- // not my char
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
- continue;
- }
- BigInt digit_value_bi;
- bigint_init_unsigned(&digit_value_bi, digit_value);
-
- BigInt radix_bi;
- bigint_init_unsigned(&radix_bi, t.radix);
-
- BigInt multiplied;
- bigint_mul(&multiplied, &t.cur_tok->data.int_lit.bigint, &radix_bi);
-
- bigint_add(&t.cur_tok->data.int_lit.bigint, &multiplied, &digit_value_bi);
- break;
- }
- case TokenizeStateNumberDot:
- {
- if (c == '.') {
- t.pos -= 2;
- end_token(&t);
- t.state = TokenizeStateStart;
- continue;
- }
- if (t.radix != 16 && t.radix != 10) {
+ case ALPHA_EXCEPT_HEX_AND_P:
invalid_char_error(&t, c);
- }
- t.pos -= 1;
- t.state = TokenizeStateFloatFractionNoUnderscore;
- assert(t.cur_tok->id == TokenIdIntLiteral);
- set_token_id(&t, t.cur_tok, TokenIdFloatLiteral);
- continue;
- }
- case TokenizeStateFloatFractionNoUnderscore:
- if (c == '_') {
- invalid_char_error(&t, c);
- } else if (get_digit_value(c) < t.radix) {
- t.is_trailing_underscore = false;
- t.state = TokenizeStateFloatFraction;
- }
- ZIG_FALLTHROUGH;
- case TokenizeStateFloatFraction:
- {
- if (c == '_') {
- t.is_trailing_underscore = true;
- t.state = TokenizeStateFloatFractionNoUnderscore;
break;
- }
- if (is_exponent_signifier(c, t.radix)) {
- if (t.is_trailing_underscore) {
- invalid_char_error(&t, c);
- break;
- }
- t.state = TokenizeStateFloatExponentUnsigned;
- t.radix = 10; // exponent is always base 10
- break;
- }
- uint32_t digit_value = get_digit_value(c);
- if (digit_value >= t.radix) {
- if (t.is_trailing_underscore) {
- invalid_char_error(&t, c);
- break;
- }
- if (is_symbol_char(c)) {
- invalid_char_error(&t, c);
- }
- // not my char
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ default:
+ t.state = TokenizeState_start;
continue;
- }
-
- // we use parse_f128 to generate the float literal, so just
- // need to get to the end of the token
}
break;
- case TokenizeStateFloatExponentUnsigned:
+ case TokenizeState_float_exponent_unsigned:
switch (c) {
case '+':
- t.state = TokenizeStateFloatExponentNumberNoUnderscore;
- break;
case '-':
- t.state = TokenizeStateFloatExponentNumberNoUnderscore;
+ t.state = TokenizeState_float_exponent_num_no_underscore;
break;
default:
- // reinterpret as normal exponent number
- t.pos -= 1;
- t.state = TokenizeStateFloatExponentNumberNoUnderscore;
+ // Reinterpret as a normal exponent number.
+ t.state = TokenizeState_float_exponent_num_no_underscore;
continue;
}
break;
- case TokenizeStateFloatExponentNumberNoUnderscore:
- if (c == '_') {
- invalid_char_error(&t, c);
- } else if (get_digit_value(c) < t.radix) {
- t.is_trailing_underscore = false;
- t.state = TokenizeStateFloatExponentNumber;
- }
- ZIG_FALLTHROUGH;
- case TokenizeStateFloatExponentNumber:
- {
- if (c == '_') {
- t.is_trailing_underscore = true;
- t.state = TokenizeStateFloatExponentNumberNoUnderscore;
- break;
- }
- uint32_t digit_value = get_digit_value(c);
- if (digit_value >= t.radix) {
- if (t.is_trailing_underscore) {
- invalid_char_error(&t, c);
- break;
- }
- if (is_symbol_char(c)) {
- invalid_char_error(&t, c);
- }
- // not my char
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
- continue;
- }
-
- // we use parse_f128 to generate the float literal, so just
- // need to get to the end of the token
- }
- break;
- case TokenizeStateSawDash:
+ case TokenizeState_float_exponent_num_no_underscore:
switch (c) {
- case '>':
- set_token_id(&t, t.cur_tok, TokenIdArrow);
- end_token(&t);
- t.state = TokenizeStateStart;
- break;
- case '=':
- set_token_id(&t, t.cur_tok, TokenIdMinusEq);
- end_token(&t);
- t.state = TokenizeStateStart;
- break;
- case '%':
- set_token_id(&t, t.cur_tok, TokenIdMinusPercent);
- t.state = TokenizeStateSawMinusPercent;
+ case DIGIT:
+ t.state = TokenizeState_float_exponent_num;
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
- continue;
+ invalid_char_error(&t, c);
}
break;
- case TokenizeStateSawMinusPercent:
+ case TokenizeState_float_exponent_num:
switch (c) {
- case '=':
- set_token_id(&t, t.cur_tok, TokenIdMinusPercentEq);
- end_token(&t);
- t.state = TokenizeStateStart;
+ case 0:
+ goto eof;
+ case '_':
+ t.state = TokenizeState_float_exponent_num_no_underscore;
+ break;
+ case DIGIT:
+ break;
+ case ALPHA:
+ invalid_char_error(&t, c);
break;
default:
- t.pos -= 1;
- end_token(&t);
- t.state = TokenizeStateStart;
+ t.state = TokenizeState_start;
continue;
}
break;
}
+ t.pos += 1;
if (c == '\n') {
- out->line_offsets->append(t.pos + 1);
t.line += 1;
t.column = 0;
} else {
t.column += 1;
}
}
- // EOF
- switch (t.state) {
- case TokenizeStateStart:
- case TokenizeStateError:
- break;
- case TokenizeStateNumberNoUnderscore:
- case TokenizeStateFloatFractionNoUnderscore:
- case TokenizeStateFloatExponentNumberNoUnderscore:
- case TokenizeStateNumberDot:
- tokenize_error(&t, "unterminated number literal");
- break;
- case TokenizeStateString:
- tokenize_error(&t, "unterminated string");
- break;
- case TokenizeStateStringEscape:
- case TokenizeStateStringEscapeUnicodeStart:
- case TokenizeStateCharCode:
- if (t.cur_tok->id == TokenIdStringLiteral) {
- tokenize_error(&t, "unterminated string");
- break;
- } else if (t.cur_tok->id == TokenIdCharLiteral) {
- tokenize_error(&t, "unterminated Unicode code point literal");
- break;
- } else {
- zig_unreachable();
- }
- break;
- case TokenizeStateCharLiteral:
- case TokenizeStateCharLiteralEnd:
- case TokenizeStateCharLiteralUnicode:
- tokenize_error(&t, "unterminated Unicode code point literal");
- break;
- case TokenizeStateSymbol:
- case TokenizeStateZero:
- case TokenizeStateNumber:
- case TokenizeStateFloatFraction:
- case TokenizeStateFloatExponentUnsigned:
- case TokenizeStateFloatExponentNumber:
- case TokenizeStateSawStar:
- case TokenizeStateSawSlash:
- case TokenizeStateSawPercent:
- case TokenizeStateSawPlus:
- case TokenizeStateSawDash:
- case TokenizeStateSawAmpersand:
- case TokenizeStateSawCaret:
- case TokenizeStateSawBar:
- case TokenizeStateSawEq:
- case TokenizeStateSawBang:
- case TokenizeStateSawLessThan:
- case TokenizeStateSawLessThanLessThan:
- case TokenizeStateSawGreaterThan:
- case TokenizeStateSawGreaterThanGreaterThan:
- case TokenizeStateSawDot:
- case TokenizeStateSawDotStar:
- case TokenizeStateSawAtSign:
- case TokenizeStateSawStarPercent:
- case TokenizeStateSawPlusPercent:
- case TokenizeStateSawMinusPercent:
- case TokenizeStateLineString:
- case TokenizeStateLineStringEnd:
- case TokenizeStateDocComment:
- case TokenizeStateContainerDocComment:
- end_token(&t);
- break;
- case TokenizeStateSawDotDot:
- case TokenizeStateSawBackslash:
- case TokenizeStateLineStringContinue:
- tokenize_error(&t, "unexpected EOF");
- break;
- case TokenizeStateLineComment:
- break;
- case TokenizeStateSawSlash2:
- cancel_token(&t);
- break;
- case TokenizeStateSawSlash3:
- set_token_id(&t, t.cur_tok, TokenIdDocComment);
- end_token(&t);
- break;
- case TokenizeStateSawSlashBang:
- set_token_id(&t, t.cur_tok, TokenIdContainerDocComment);
- end_token(&t);
- break;
- }
- if (t.state != TokenizeStateError) {
- if (t.tokens->length > 0) {
- Token *last_token = &t.tokens->last();
- t.line = (int)last_token->start_line;
- t.column = (int)last_token->start_column;
- t.pos = last_token->start_pos;
- } else {
- t.pos = 0;
- }
- begin_token(&t, TokenIdEof);
- end_token(&t);
- assert(!t.cur_tok);
- }
+eof:;
+
+ begin_token(&t, TokenIdEof);
}
const char * token_name(TokenId id) {
switch (id) {
case TokenIdAmpersand: return "&";
case TokenIdArrow: return "->";
- case TokenIdAtSign: return "@";
case TokenIdBang: return "!";
case TokenIdBarBar: return "||";
case TokenIdBinOr: return "|";
@@ -1622,9 +1530,7 @@ const char * token_name(TokenId id) {
case TokenIdMinusPercent: return "-%";
case TokenIdMinusPercentEq: return "-%=";
case TokenIdModEq: return "%=";
- case TokenIdNumberSign: return "#";
case TokenIdPercent: return "%";
- case TokenIdPercentDot: return "%.";
case TokenIdPlus: return "+";
case TokenIdPlusEq: return "+=";
case TokenIdPlusPercent: return "+%";
@@ -1638,33 +1544,15 @@ const char * token_name(TokenId id) {
case TokenIdStar: return "*";
case TokenIdStarStar: return "**";
case TokenIdStringLiteral: return "StringLiteral";
- case TokenIdMultilineStringLiteral: return "MultilineStringLiteral";
- case TokenIdSymbol: return "Symbol";
+ case TokenIdMultilineStringLiteralLine: return "MultilineStringLiteralLine";
+ case TokenIdIdentifier: return "Identifier";
case TokenIdTilde: return "~";
case TokenIdTimesEq: return "*=";
case TokenIdTimesPercent: return "*%";
case TokenIdTimesPercentEq: return "*%=";
+ case TokenIdBuiltin: return "Builtin";
case TokenIdCount:
zig_unreachable();
}
return "(invalid token)";
}
-
-void print_tokens(Buf *buf, ZigList<Token> *tokens) {
- for (size_t i = 0; i < tokens->length; i += 1) {
- Token *token = &tokens->at(i);
- fprintf(stderr, "%s ", token_name(token->id));
- if (token->start_pos != SIZE_MAX) {
- fwrite(buf_ptr(buf) + token->start_pos, 1, token->end_pos - token->start_pos, stderr);
- }
- fprintf(stderr, "\n");
- }
-}
-
-bool valid_symbol_starter(uint8_t c) {
- switch (c) {
- case SYMBOL_START:
- return true;
- }
- return false;
-}