From 6bfa8546bbdf6dd644a65876135893339b767bba Mon Sep 17 00:00:00 2001
From: hryx <codroid@gmail.com>
Date: Thu, 4 Jul 2019 22:40:19 -0700
Subject: Unicode escapes: stage1 tokenizer and behavior tests

---
 src/tokenizer.cpp | 115 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 61 insertions(+), 54 deletions(-)

(limited to 'src/tokenizer.cpp')

diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index a0acde52e9..4358146f24 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -190,6 +190,7 @@ enum TokenizeState {
     TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
     TokenizeStateString,
     TokenizeStateStringEscape,
+    TokenizeStateStringEscapeUnicodeStart,
     TokenizeStateCharLiteral,
     TokenizeStateCharLiteralEnd,
     TokenizeStateSawStar,
@@ -241,7 +242,6 @@ struct Tokenize {
     int32_t exp_add_amt;
     bool is_exp_negative;
     size_t char_code_index;
-    size_t char_code_end;
     bool unicode;
     uint32_t char_code;
     int exponent_in_bin_or_dec;
@@ -1071,24 +1071,10 @@ void tokenize(Buf *buf, Tokenization *out) {
                         t.radix = 16;
                         t.char_code = 0;
                         t.char_code_index = 0;
-                        t.char_code_end = 2;
                         t.unicode = false;
                         break;
                     case 'u':
-                        t.state = TokenizeStateCharCode;
-                        t.radix = 16;
-                        t.char_code = 0;
-                        t.char_code_index = 0;
-                        t.char_code_end = 4;
-                        t.unicode = true;
-                        break;
-                    case 'U':
-                        t.state = TokenizeStateCharCode;
-                        t.radix = 16;
-                        t.char_code = 0;
-                        t.char_code_index = 0;
-                        t.char_code_end = 6;
-                        t.unicode = true;
+                        t.state = TokenizeStateStringEscapeUnicodeStart;
                         break;
                     case 'n':
                         handle_string_escape(&t, '\n');
@@ -1112,8 +1098,63 @@ void tokenize(Buf *buf, Tokenization *out) {
                         invalid_char_error(&t, c);
                 }
                 break;
+            case TokenizeStateStringEscapeUnicodeStart:
+                switch (c) {
+                    case '{':
+                        t.state = TokenizeStateCharCode;
+                        t.radix = 16;
+                        t.char_code = 0;
+                        t.char_code_index = 0;
+                        t.unicode = true;
+                        break;
+                    default:
+                        invalid_char_error(&t, c);
+                }
+                break;
             case TokenizeStateCharCode:
                 {
+                    if (t.unicode && c == '}') {
+                        if (t.char_code_index == 0) {
+                            tokenize_error(&t, "empty unicode escape sequence");
+                            break;
+                        }
+                        if (t.char_code > 0x10ffff) {
+                            tokenize_error(&t, "unicode value out of range: %x", t.char_code);
+                            break;
+                        }
+                        if (t.cur_tok->id == TokenIdCharLiteral) {
+                            t.cur_tok->data.char_lit.c = t.char_code;
+                            t.state = TokenizeStateCharLiteralEnd;
+                        } else if (t.char_code <= 0x7f) {
+                            // 00000000 00000000 00000000 0xxxxxxx
+                            handle_string_escape(&t, (uint8_t)t.char_code);
+                        } else if (t.char_code <= 0x7ff) {
+                            // 00000000 00000000 00000xxx xx000000
+                            handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+                        } else if (t.char_code <= 0xffff) {
+                            // 00000000 00000000 xxxx0000 00000000
+                            handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
+                            // 00000000 00000000 0000xxxx xx000000
+                            handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+                        } else if (t.char_code <= 0x10ffff) {
+                            // 00000000 000xxx00 00000000 00000000
+                            handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
+                            // 00000000 000000xx xxxx0000 00000000
+                            handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
+                            // 00000000 00000000 0000xxxx xx000000
+                            handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+                        } else {
+                            zig_unreachable();
+                        }
+                        break;
+                    }
+
                     uint32_t digit_value = get_digit_value(c);
                     if (digit_value >= t.radix) {
                         tokenize_error(&t, "invalid digit: '%c'", c);
@@ -1123,44 +1164,9 @@ void tokenize(Buf *buf, Tokenization *out) {
                     t.char_code += digit_value;
                     t.char_code_index += 1;
 
-                    if (t.char_code_index >= t.char_code_end) {
-                        if (t.unicode) {
-                            if (t.char_code > 0x10ffff) {
-                                tokenize_error(&t, "unicode value out of range: %x", t.char_code);
-                                break;
-                            }
-                            if (t.cur_tok->id == TokenIdCharLiteral) {
-                                t.cur_tok->data.char_lit.c = t.char_code;
-                                t.state = TokenizeStateCharLiteralEnd;
-                            } else if (t.char_code <= 0x7f) {
-                                // 00000000 00000000 00000000 0xxxxxxx
-                                handle_string_escape(&t, (uint8_t)t.char_code);
-                            } else if (t.char_code <= 0x7ff) {
-                                // 00000000 00000000 00000xxx xx000000
-                                handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
-                                // 00000000 00000000 00000000 00xxxxxx
-                                handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            } else if (t.char_code <= 0xffff) {
-                                // 00000000 00000000 xxxx0000 00000000
-                                handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
-                                // 00000000 00000000 0000xxxx xx000000
-                                handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
-                                // 00000000 00000000 00000000 00xxxxxx
-                                handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            } else if (t.char_code <= 0x10ffff) {
-                                // 00000000 000xxx00 00000000 00000000
-                                handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
-                                // 00000000 000000xx xxxx0000 00000000
-                                handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
-                                // 00000000 00000000 0000xxxx xx000000
-                                handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
-                                // 00000000 00000000 00000000 00xxxxxx
-                                handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            }
-                        } else {
-                            assert(t.char_code <= 255);
-                            handle_string_escape(&t, (uint8_t)t.char_code);
-                        }
+                    if (!t.unicode && t.char_code_index >= 2) {
+                        assert(t.char_code <= 255);
+                        handle_string_escape(&t, (uint8_t)t.char_code);
                     }
                 }
                 break;
@@ -1409,6 +1415,7 @@ void tokenize(Buf *buf, Tokenization *out) {
             tokenize_error(&t, "unterminated string");
             break;
         case TokenizeStateStringEscape:
+        case TokenizeStateStringEscapeUnicodeStart:
         case TokenizeStateCharCode:
             if (t.cur_tok->id == TokenIdStringLiteral) {
                 tokenize_error(&t, "unterminated string");
-- 
cgit v1.2.3


From cce11a724cfe8c266d961a422852a977a1601c6f Mon Sep 17 00:00:00 2001
From: hryx <codroid@gmail.com>
Date: Sat, 6 Jul 2019 15:57:32 -0700
Subject: Make anyerror not a keyword

---
 doc/langref.html.in   | 4 +---
 src/parser.cpp        | 5 -----
 src/tokenizer.cpp     | 2 --
 src/tokenizer.hpp     | 1 -
 std/zig/parse.zig     | 2 --
 std/zig/tokenizer.zig | 2 --
 6 files changed, 1 insertion(+), 15 deletions(-)

(limited to 'src/tokenizer.cpp')

diff --git a/doc/langref.html.in b/doc/langref.html.in
index ae5744c52b..5437dcc801 100644
--- a/doc/langref.html.in
+++ b/doc/langref.html.in
@@ -9798,7 +9798,6 @@ PrimaryTypeExpr
      / IDENTIFIER
      / IfTypeExpr
      / INTEGER
-     / KEYWORD_anyerror
      / KEYWORD_comptime TypeExpr
      / KEYWORD_error DOT IDENTIFIER
      / KEYWORD_false
@@ -10104,7 +10103,6 @@ end_of_word &lt;- ![a-zA-Z0-9_] skip
 KEYWORD_align       &lt;- 'align'       end_of_word
 KEYWORD_allowzero   &lt;- 'allowzero'   end_of_word
 KEYWORD_and         &lt;- 'and'         end_of_word
-KEYWORD_anyerror    &lt;- 'anyerror'    end_of_word
 KEYWORD_asm         &lt;- 'asm'         end_of_word
 KEYWORD_async       &lt;- 'async'       end_of_word
 KEYWORD_await       &lt;- 'await'       end_of_word
@@ -10153,7 +10151,7 @@ KEYWORD_var         &lt;- 'var'         end_of_word
 KEYWORD_volatile    &lt;- 'volatile'    end_of_word
 KEYWORD_while       &lt;- 'while'       end_of_word
 
-keyword &lt;- KEYWORD_align / KEYWORD_and / KEYWORD_allowzero / KEYWORD_anyerror / KEYWORD_asm
+keyword &lt;- KEYWORD_align / KEYWORD_and / KEYWORD_allowzero / KEYWORD_asm
          / KEYWORD_async / KEYWORD_await / KEYWORD_break / KEYWORD_cancel
          / KEYWORD_catch / KEYWORD_comptime / KEYWORD_const / KEYWORD_continue
          / KEYWORD_defer / KEYWORD_else / KEYWORD_enum / KEYWORD_errdefer
diff --git a/src/parser.cpp b/src/parser.cpp
index f35e54f6de..25541d5351 100644
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -1519,7 +1519,6 @@ static AstNode *ast_parse_suffix_expr(ParseContext *pc) {
 //      / IDENTIFIER
 //      / IfTypeExpr
 //      / INTEGER
-//      / KEYWORD_anyerror
 //      / KEYWORD_comptime TypeExpr
 //      / KEYWORD_error DOT IDENTIFIER
 //      / KEYWORD_false
@@ -1614,10 +1613,6 @@ static AstNode *ast_parse_primary_type_expr(ParseContext *pc) {
         return res;
     }
 
-    Token *error_type = eat_token_if(pc, TokenIdKeywordAnyerror);
-    if (error_type != nullptr)
-        return ast_create_node(pc, NodeTypeErrorType, error_type);
-
     Token *comptime = eat_token_if(pc, TokenIdKeywordCompTime);
     if (comptime != nullptr) {
         AstNode *expr = ast_expect(pc, ast_parse_type_expr);
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 4358146f24..783b6e0e20 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -109,7 +109,6 @@ static const struct ZigKeyword zig_keywords[] = {
     {"align", TokenIdKeywordAlign},
     {"allowzero", TokenIdKeywordAllowZero},
     {"and", TokenIdKeywordAnd},
-    {"anyerror", TokenIdKeywordAnyerror},
     {"asm", TokenIdKeywordAsm},
     {"async", TokenIdKeywordAsync},
     {"await", TokenIdKeywordAwait},
@@ -1528,7 +1527,6 @@ const char * token_name(TokenId id) {
         case TokenIdFloatLiteral: return "FloatLiteral";
         case TokenIdIntLiteral: return "IntLiteral";
         case TokenIdKeywordAsync: return "async";
-        case TokenIdKeywordAnyerror: return "anyerror";
         case TokenIdKeywordAllowZero: return "allowzero";
         case TokenIdKeywordAwait: return "await";
         case TokenIdKeywordResume: return "resume";
diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp
index d5174c24de..83dbe99471 100644
--- a/src/tokenizer.hpp
+++ b/src/tokenizer.hpp
@@ -53,7 +53,6 @@ enum TokenId {
     TokenIdKeywordAlign,
     TokenIdKeywordAllowZero,
     TokenIdKeywordAnd,
-    TokenIdKeywordAnyerror,
     TokenIdKeywordAsm,
     TokenIdKeywordAsync,
     TokenIdKeywordAwait,
diff --git a/std/zig/parse.zig b/std/zig/parse.zig
index da258c9237..4f3fb76b54 100644
--- a/std/zig/parse.zig
+++ b/std/zig/parse.zig
@@ -1197,7 +1197,6 @@ fn parseSuffixExpr(arena: *Allocator, it: *TokenIterator, tree: *Tree) !?*Node {
 ///      / IDENTIFIER
 ///      / IfTypeExpr
 ///      / INTEGER
-///      / KEYWORD_anyerror
 ///      / KEYWORD_comptime TypeExpr
 ///      / KEYWORD_error DOT IDENTIFIER
 ///      / KEYWORD_false
@@ -1228,7 +1227,6 @@ fn parsePrimaryTypeExpr(arena: *Allocator, it: *TokenIterator, tree: *Tree) !?*N
     if (try parseIdentifier(arena, it, tree)) |node| return node;
     if (try parseIfTypeExpr(arena, it, tree)) |node| return node;
     if (try parseIntegerLiteral(arena, it, tree)) |node| return node;
-    if (eatToken(it, .Keyword_anyerror)) |token| return createLiteral(arena, Node.ErrorType, token);
     if (eatToken(it, .Keyword_comptime)) |token| {
         const expr = (try parseTypeExpr(arena, it, tree)) orelse return null;
         const node = try arena.create(Node.Comptime);
diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig
index 71765e2025..7bd5c537d3 100644
--- a/std/zig/tokenizer.zig
+++ b/std/zig/tokenizer.zig
@@ -15,7 +15,6 @@ pub const Token = struct {
         Keyword{ .bytes = "align", .id = Id.Keyword_align },
         Keyword{ .bytes = "allowzero", .id = Id.Keyword_allowzero },
         Keyword{ .bytes = "and", .id = Id.Keyword_and },
-        Keyword{ .bytes = "anyerror", .id = Id.Keyword_anyerror },
         Keyword{ .bytes = "asm", .id = Id.Keyword_asm },
         Keyword{ .bytes = "async", .id = Id.Keyword_async },
         Keyword{ .bytes = "await", .id = Id.Keyword_await },
@@ -147,7 +146,6 @@ pub const Token = struct {
         Keyword_align,
         Keyword_allowzero,
         Keyword_and,
-        Keyword_anyerror,
         Keyword_asm,
         Keyword_async,
         Keyword_await,
-- 
cgit v1.2.3