diff options
| author | Nick Erdmann <n@nirf.de> | 2019-10-06 19:52:35 +0200 |
|---|---|---|
| committer | Nick Erdmann <n@nirf.de> | 2019-10-07 08:18:16 +0200 |
| commit | ae7392e504e7765b05d98636cc249cbf92233f5c (patch) | |
| tree | 42fa9e83943c41a4e84e0152b1deb427dfbdc292 /lib/std | |
| parent | 571123465b2e030b7b9cf42732ed30f77192fbcd (diff) | |
| download | zig-ae7392e504e7765b05d98636cc249cbf92233f5c.tar.gz zig-ae7392e504e7765b05d98636cc249cbf92233f5c.zip | |
unicode character literals
Diffstat (limited to 'lib/std')
| -rw-r--r-- | lib/std/zig/tokenizer.zig | 41 |
1 file changed, 35 insertions, 6 deletions
diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig index b0a6cd1122..872893a607 100644 --- a/lib/std/zig/tokenizer.zig +++ b/lib/std/zig/tokenizer.zig @@ -371,6 +371,7 @@ pub const Tokenizer = struct { CharLiteralUnicodeEscapeSawU, CharLiteralUnicodeEscape, CharLiteralUnicodeInvalid, + CharLiteralUnicode, CharLiteralEnd, Backslash, Equal, @@ -427,6 +428,7 @@ pub const Tokenizer = struct { .end = undefined, }; var seen_escape_digits: usize = undefined; + var remaining_code_units: usize = undefined; while (self.index < self.buffer.len) : (self.index += 1) { const c = self.buffer[self.index]; switch (state) { @@ -774,16 +776,23 @@ pub const Tokenizer = struct { '\\' => { state = State.CharLiteralBackslash; }, - '\'' => { + '\'', 0x80...0xbf, 0xf8...0xff => { result.id = Token.Id.Invalid; break; }, + 0xc0...0xdf => { // 110xxxxx + remaining_code_units = 1; + state = State.CharLiteralUnicode; + }, + 0xe0...0xef => { // 1110xxxx + remaining_code_units = 2; + state = State.CharLiteralUnicode; + }, + 0xf0...0xf7 => { // 11110xxx + remaining_code_units = 3; + state = State.CharLiteralUnicode; + }, else => { - if (c < 0x20 or c == 0x7f) { - result.id = Token.Id.Invalid; - break; - } - state = State.CharLiteralEnd; }, }, @@ -867,6 +876,19 @@ pub const Tokenizer = struct { }, }, + State.CharLiteralUnicode => switch (c) { + 0x80...0xbf => { + remaining_code_units -= 1; + if (remaining_code_units == 0) { + state = State.CharLiteralEnd; + } + }, + else => { + result.id = Token.Id.Invalid; + break; + }, + }, + State.MultilineStringLiteralLine => switch (c) { '\n' => { self.index += 1; @@ -1220,6 +1242,7 @@ pub const Tokenizer = struct { State.CharLiteralUnicodeEscape, State.CharLiteralUnicodeInvalid, State.CharLiteralEnd, + State.CharLiteralUnicode, State.StringLiteralBackslash, State.LBracketStar, State.LBracketStarC, @@ -1428,6 +1451,12 @@ test "tokenizer - char literal with unicode escapes" { , [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid }); } +test "tokenizer - char literal with unicode code point" { + testTokenize( + \\'💩' + , [_]Token.Id{.CharLiteral}); +} + test "tokenizer - float literal e exponent" { testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{ Token.Id.Identifier, |
