author    Andrew Kelley <andrew@ziglang.org>    2019-10-09 13:25:41 -0400
committer GitHub <noreply@github.com>           2019-10-09 13:25:41 -0400
commit    406b70aa56b9a95e768c321dd3caf164add1b49a (patch)
tree      dc89654bce360751073bd712fe51b8c023900636 /lib/std
parent    f929a58d5f69c26c25ced89f31f60d0a92ffc46a (diff)
parent    ae7392e504e7765b05d98636cc249cbf92233f5c (diff)
Merge pull request #3390 from nrdmn/unicode_character_literals
unicode character literals
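
This change teaches the tokenizer to accept multi-byte UTF-8 encoded code points inside character literals. As an illustration only (the line below is not part of the commit), a literal such as the following now lexes as a single CharLiteral token instead of Invalid:

    const c = '💩';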
Diffstat (limited to 'lib/std')
-rw-r--r--  lib/std/zig/tokenizer.zig  41
1 file changed, 35 insertions(+), 6 deletions(-)
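
The diff below classifies the leading byte of a UTF-8 sequence directly in the tokenizer's state machine: 0b110xxxxx starts a 2-byte sequence (one continuation byte), 0b1110xxxx a 3-byte sequence (two continuation bytes), 0b11110xxx a 4-byte sequence (three continuation bytes), while 0x80..0xbf and 0xf8..0xff are rejected as leading bytes. As a minimal standalone sketch of that classification, using a hypothetical helper name that does not appear in the commit:

    // Hypothetical helper, for illustration only: maps a UTF-8 leading byte
    // to the number of continuation bytes (0x80..0xbf) that must follow it,
    // mirroring the ranges handled in the tokenizer below.
    fn utf8ContinuationCount(leading: u8) ?usize {
        return switch (leading) {
            0xc0...0xdf => 1, // 110xxxxx
            0xe0...0xef => 2, // 1110xxxx
            0xf0...0xf7 => 3, // 11110xxx
            else => null, // not a valid start of a multi-byte sequence
        };
    }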
diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig
index b0a6cd1122..872893a607 100644
--- a/lib/std/zig/tokenizer.zig
+++ b/lib/std/zig/tokenizer.zig
@@ -371,6 +371,7 @@ pub const Tokenizer = struct {
CharLiteralUnicodeEscapeSawU,
CharLiteralUnicodeEscape,
CharLiteralUnicodeInvalid,
+ CharLiteralUnicode,
CharLiteralEnd,
Backslash,
Equal,
@@ -427,6 +428,7 @@ pub const Tokenizer = struct {
.end = undefined,
};
var seen_escape_digits: usize = undefined;
+ var remaining_code_units: usize = undefined;
while (self.index < self.buffer.len) : (self.index += 1) {
const c = self.buffer[self.index];
switch (state) {
@@ -774,16 +776,23 @@ pub const Tokenizer = struct {
'\\' => {
state = State.CharLiteralBackslash;
},
- '\'' => {
+ '\'', 0x80...0xbf, 0xf8...0xff => {
result.id = Token.Id.Invalid;
break;
},
+ 0xc0...0xdf => { // 110xxxxx
+ remaining_code_units = 1;
+ state = State.CharLiteralUnicode;
+ },
+ 0xe0...0xef => { // 1110xxxx
+ remaining_code_units = 2;
+ state = State.CharLiteralUnicode;
+ },
+ 0xf0...0xf7 => { // 11110xxx
+ remaining_code_units = 3;
+ state = State.CharLiteralUnicode;
+ },
else => {
- if (c < 0x20 or c == 0x7f) {
- result.id = Token.Id.Invalid;
- break;
- }
-
state = State.CharLiteralEnd;
},
},
@@ -867,6 +876,19 @@ pub const Tokenizer = struct {
},
},
+ State.CharLiteralUnicode => switch (c) {
+ 0x80...0xbf => {
+ remaining_code_units -= 1;
+ if (remaining_code_units == 0) {
+ state = State.CharLiteralEnd;
+ }
+ },
+ else => {
+ result.id = Token.Id.Invalid;
+ break;
+ },
+ },
+
State.MultilineStringLiteralLine => switch (c) {
'\n' => {
self.index += 1;
@@ -1220,6 +1242,7 @@ pub const Tokenizer = struct {
State.CharLiteralUnicodeEscape,
State.CharLiteralUnicodeInvalid,
State.CharLiteralEnd,
+ State.CharLiteralUnicode,
State.StringLiteralBackslash,
State.LBracketStar,
State.LBracketStarC,
@@ -1428,6 +1451,12 @@ test "tokenizer - char literal with unicode escapes" {
, [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
}
+test "tokenizer - char literal with unicode code point" {
+ testTokenize(
+ \\'💩'
+ , [_]Token.Id{.CharLiteral});
+}
+
test "tokenizer - float literal e exponent" {
testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{
Token.Id.Identifier,