From 6bfa8546bbdf6dd644a65876135893339b767bba Mon Sep 17 00:00:00 2001 From: hryx Date: Thu, 4 Jul 2019 22:40:19 -0700 Subject: Unicode escapes: stage1 tokenizer and behavior tests --- src/tokenizer.cpp | 115 +++++++++++++++++++++++++++++------------------------- 1 file changed, 61 insertions(+), 54 deletions(-) (limited to 'src/tokenizer.cpp') diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index a0acde52e9..4358146f24 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -190,6 +190,7 @@ enum TokenizeState { TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5" TokenizeStateString, TokenizeStateStringEscape, + TokenizeStateStringEscapeUnicodeStart, TokenizeStateCharLiteral, TokenizeStateCharLiteralEnd, TokenizeStateSawStar, @@ -241,7 +242,6 @@ struct Tokenize { int32_t exp_add_amt; bool is_exp_negative; size_t char_code_index; - size_t char_code_end; bool unicode; uint32_t char_code; int exponent_in_bin_or_dec; @@ -1071,24 +1071,10 @@ void tokenize(Buf *buf, Tokenization *out) { t.radix = 16; t.char_code = 0; t.char_code_index = 0; - t.char_code_end = 2; t.unicode = false; break; case 'u': - t.state = TokenizeStateCharCode; - t.radix = 16; - t.char_code = 0; - t.char_code_index = 0; - t.char_code_end = 4; - t.unicode = true; - break; - case 'U': - t.state = TokenizeStateCharCode; - t.radix = 16; - t.char_code = 0; - t.char_code_index = 0; - t.char_code_end = 6; - t.unicode = true; + t.state = TokenizeStateStringEscapeUnicodeStart; break; case 'n': handle_string_escape(&t, '\n'); @@ -1112,8 +1098,63 @@ void tokenize(Buf *buf, Tokenization *out) { invalid_char_error(&t, c); } break; + case TokenizeStateStringEscapeUnicodeStart: + switch (c) { + case '{': + t.state = TokenizeStateCharCode; + t.radix = 16; + t.char_code = 0; + t.char_code_index = 0; + t.unicode = true; + break; + default: + invalid_char_error(&t, c); + } + break; case TokenizeStateCharCode: { + if (t.unicode && c == '}') { + if (t.char_code_index == 0) { + tokenize_error(&t, "empty unicode escape sequence"); + break; + } + if (t.char_code > 0x10ffff) { + tokenize_error(&t, "unicode value out of range: %x", t.char_code); + break; + } + if (t.cur_tok->id == TokenIdCharLiteral) { + t.cur_tok->data.char_lit.c = t.char_code; + t.state = TokenizeStateCharLiteralEnd; + } else if (t.char_code <= 0x7f) { + // 00000000 00000000 00000000 0xxxxxxx + handle_string_escape(&t, (uint8_t)t.char_code); + } else if (t.char_code <= 0x7ff) { + // 00000000 00000000 00000xxx xx000000 + handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6))); + // 00000000 00000000 00000000 00xxxxxx + handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); + } else if (t.char_code <= 0xffff) { + // 00000000 00000000 xxxx0000 00000000 + handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12))); + // 00000000 00000000 0000xxxx xx000000 + handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f))); + // 00000000 00000000 00000000 00xxxxxx + handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); + } else if (t.char_code <= 0x10ffff) { + // 00000000 000xxx00 00000000 00000000 + handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18))); + // 00000000 000000xx xxxx0000 00000000 + handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f))); + // 00000000 00000000 0000xxxx xx000000 + handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f))); + // 00000000 00000000 00000000 00xxxxxx + handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); + } else { + zig_unreachable(); + } + break; + } + uint32_t digit_value = get_digit_value(c); if (digit_value >= t.radix) { tokenize_error(&t, "invalid digit: '%c'", c); @@ -1123,44 +1164,9 @@ void tokenize(Buf *buf, Tokenization *out) { t.char_code += digit_value; t.char_code_index += 1; - if (t.char_code_index >= t.char_code_end) { - if (t.unicode) { - if (t.char_code > 0x10ffff) { - tokenize_error(&t, "unicode value out of range: %x", t.char_code); - break; - } - if (t.cur_tok->id == TokenIdCharLiteral) { - t.cur_tok->data.char_lit.c = t.char_code; - t.state = TokenizeStateCharLiteralEnd; - } else if (t.char_code <= 0x7f) { - // 00000000 00000000 00000000 0xxxxxxx - handle_string_escape(&t, (uint8_t)t.char_code); - } else if (t.char_code <= 0x7ff) { - // 00000000 00000000 00000xxx xx000000 - handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6))); - // 00000000 00000000 00000000 00xxxxxx - handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); - } else if (t.char_code <= 0xffff) { - // 00000000 00000000 xxxx0000 00000000 - handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12))); - // 00000000 00000000 0000xxxx xx000000 - handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f))); - // 00000000 00000000 00000000 00xxxxxx - handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); - } else if (t.char_code <= 0x10ffff) { - // 00000000 000xxx00 00000000 00000000 - handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18))); - // 00000000 000000xx xxxx0000 00000000 - handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f))); - // 00000000 00000000 0000xxxx xx000000 - handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f))); - // 00000000 00000000 00000000 00xxxxxx - handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); - } - } else { - assert(t.char_code <= 255); - handle_string_escape(&t, (uint8_t)t.char_code); - } + if (!t.unicode && t.char_code_index >= 2) { + assert(t.char_code <= 255); + handle_string_escape(&t, (uint8_t)t.char_code); } } break; @@ -1409,6 +1415,7 @@ void tokenize(Buf *buf, Tokenization *out) { tokenize_error(&t, "unterminated string"); break; case TokenizeStateStringEscape: + case TokenizeStateStringEscapeUnicodeStart: case TokenizeStateCharCode: if (t.cur_tok->id == TokenIdStringLiteral) { tokenize_error(&t, "unterminated string"); -- cgit v1.2.3 From cce11a724cfe8c266d961a422852a977a1601c6f Mon Sep 17 00:00:00 2001 From: hryx Date: Sat, 6 Jul 2019 15:57:32 -0700 Subject: Make anyerror not a keyword --- doc/langref.html.in | 4 +--- src/parser.cpp | 5 ----- src/tokenizer.cpp | 2 -- src/tokenizer.hpp | 1 - std/zig/parse.zig | 2 -- std/zig/tokenizer.zig | 2 -- 6 files changed, 1 insertion(+), 15 deletions(-) (limited to 'src/tokenizer.cpp') diff --git a/doc/langref.html.in b/doc/langref.html.in index ae5744c52b..5437dcc801 100644 --- a/doc/langref.html.in +++ b/doc/langref.html.in @@ -9798,7 +9798,6 @@ PrimaryTypeExpr / IDENTIFIER / IfTypeExpr / INTEGER - / KEYWORD_anyerror / KEYWORD_comptime TypeExpr / KEYWORD_error DOT IDENTIFIER / KEYWORD_false @@ -10104,7 +10103,6 @@ end_of_word <- ![a-zA-Z0-9_] skip KEYWORD_align <- 'align' end_of_word KEYWORD_allowzero <- 'allowzero' end_of_word KEYWORD_and <- 'and' end_of_word -KEYWORD_anyerror <- 'anyerror' end_of_word KEYWORD_asm <- 'asm' end_of_word KEYWORD_async <- 'async' end_of_word KEYWORD_await <- 'await' end_of_word @@ -10153,7 +10151,7 @@ KEYWORD_var <- 'var' end_of_word KEYWORD_volatile <- 'volatile' end_of_word KEYWORD_while <- 'while' end_of_word -keyword <- KEYWORD_align / KEYWORD_and / KEYWORD_allowzero / KEYWORD_anyerror / KEYWORD_asm +keyword <- KEYWORD_align / KEYWORD_and / KEYWORD_allowzero / KEYWORD_asm / KEYWORD_async / KEYWORD_await / KEYWORD_break / KEYWORD_cancel / KEYWORD_catch / KEYWORD_comptime / KEYWORD_const / KEYWORD_continue / KEYWORD_defer / KEYWORD_else / KEYWORD_enum / KEYWORD_errdefer diff --git a/src/parser.cpp b/src/parser.cpp index f35e54f6de..25541d5351 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -1519,7 +1519,6 @@ static AstNode *ast_parse_suffix_expr(ParseContext *pc) { // / IDENTIFIER // / IfTypeExpr // / INTEGER -// / KEYWORD_anyerror // / KEYWORD_comptime TypeExpr // / KEYWORD_error DOT IDENTIFIER // / KEYWORD_false @@ -1614,10 +1613,6 @@ static AstNode *ast_parse_primary_type_expr(ParseContext *pc) { return res; } - Token *error_type = eat_token_if(pc, TokenIdKeywordAnyerror); - if (error_type != nullptr) - return ast_create_node(pc, NodeTypeErrorType, error_type); - Token *comptime = eat_token_if(pc, TokenIdKeywordCompTime); if (comptime != nullptr) { AstNode *expr = ast_expect(pc, ast_parse_type_expr); diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 4358146f24..783b6e0e20 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -109,7 +109,6 @@ static const struct ZigKeyword zig_keywords[] = { {"align", TokenIdKeywordAlign}, {"allowzero", TokenIdKeywordAllowZero}, {"and", TokenIdKeywordAnd}, - {"anyerror", TokenIdKeywordAnyerror}, {"asm", TokenIdKeywordAsm}, {"async", TokenIdKeywordAsync}, {"await", TokenIdKeywordAwait}, @@ -1528,7 +1527,6 @@ const char * token_name(TokenId id) { case TokenIdFloatLiteral: return "FloatLiteral"; case TokenIdIntLiteral: return "IntLiteral"; case TokenIdKeywordAsync: return "async"; - case TokenIdKeywordAnyerror: return "anyerror"; case TokenIdKeywordAllowZero: return "allowzero"; case TokenIdKeywordAwait: return "await"; case TokenIdKeywordResume: return "resume"; diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp index d5174c24de..83dbe99471 100644 --- a/src/tokenizer.hpp +++ b/src/tokenizer.hpp @@ -53,7 +53,6 @@ enum TokenId { TokenIdKeywordAlign, TokenIdKeywordAllowZero, TokenIdKeywordAnd, - TokenIdKeywordAnyerror, TokenIdKeywordAsm, TokenIdKeywordAsync, TokenIdKeywordAwait, diff --git a/std/zig/parse.zig b/std/zig/parse.zig index da258c9237..4f3fb76b54 100644 --- a/std/zig/parse.zig +++ b/std/zig/parse.zig @@ -1197,7 +1197,6 @@ fn parseSuffixExpr(arena: *Allocator, it: *TokenIterator, tree: *Tree) !?*Node { /// / IDENTIFIER /// / IfTypeExpr /// / INTEGER -/// / KEYWORD_anyerror /// / KEYWORD_comptime TypeExpr /// / KEYWORD_error DOT IDENTIFIER /// / KEYWORD_false @@ -1228,7 +1227,6 @@ fn parsePrimaryTypeExpr(arena: *Allocator, it: *TokenIterator, tree: *Tree) !?*N if (try parseIdentifier(arena, it, tree)) |node| return node; if (try parseIfTypeExpr(arena, it, tree)) |node| return node; if (try parseIntegerLiteral(arena, it, tree)) |node| return node; - if (eatToken(it, .Keyword_anyerror)) |token| return createLiteral(arena, Node.ErrorType, token); if (eatToken(it, .Keyword_comptime)) |token| { const expr = (try parseTypeExpr(arena, it, tree)) orelse return null; const node = try arena.create(Node.Comptime); diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig index 71765e2025..7bd5c537d3 100644 --- a/std/zig/tokenizer.zig +++ b/std/zig/tokenizer.zig @@ -15,7 +15,6 @@ pub const Token = struct { Keyword{ .bytes = "align", .id = Id.Keyword_align }, Keyword{ .bytes = "allowzero", .id = Id.Keyword_allowzero }, Keyword{ .bytes = "and", .id = Id.Keyword_and }, - Keyword{ .bytes = "anyerror", .id = Id.Keyword_anyerror }, Keyword{ .bytes = "asm", .id = Id.Keyword_asm }, Keyword{ .bytes = "async", .id = Id.Keyword_async }, Keyword{ .bytes = "await", .id = Id.Keyword_await }, @@ -147,7 +146,6 @@ pub const Token = struct { Keyword_align, Keyword_allowzero, Keyword_and, - Keyword_anyerror, Keyword_asm, Keyword_async, Keyword_await, -- cgit v1.2.3