diff options
| author | Nick Erdmann <n@nirf.de> | 2019-10-06 19:52:35 +0200 |
|---|---|---|
| committer | Nick Erdmann <n@nirf.de> | 2019-10-07 08:18:16 +0200 |
| commit | ae7392e504e7765b05d98636cc249cbf92233f5c (patch) | |
| tree | 42fa9e83943c41a4e84e0152b1deb427dfbdc292 /lib/std | |
| parent | 571123465b2e030b7b9cf42732ed30f77192fbcd (diff) | |
| download | zig-ae7392e504e7765b05d98636cc249cbf92233f5c.tar.gz zig-ae7392e504e7765b05d98636cc249cbf92233f5c.zip | |
unicode character literals
Diffstat (limited to 'lib/std')
| -rw-r--r-- | lib/std/zig/tokenizer.zig | 41 |
1 file changed, 35 insertions, 6 deletions
diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig index b0a6cd1122..872893a607 100644 --- a/lib/std/zig/tokenizer.zig +++ b/lib/std/zig/tokenizer.zig @@ -371,6 +371,7 @@ pub const Tokenizer = struct { CharLiteralUnicodeEscapeSawU, CharLiteralUnicodeEscape, CharLiteralUnicodeInvalid, + CharLiteralUnicode, CharLiteralEnd, Backslash, Equal, @@ -427,6 +428,7 @@ pub const Tokenizer = struct { .end = undefined, }; var seen_escape_digits: usize = undefined; + var remaining_code_units: usize = undefined; while (self.index < self.buffer.len) : (self.index += 1) { const c = self.buffer[self.index]; switch (state) { @@ -774,16 +776,23 @@ pub const Tokenizer = struct { '\\' => { state = State.CharLiteralBackslash; }, - '\'' => { + '\'', 0x80...0xbf, 0xf8...0xff => { result.id = Token.Id.Invalid; break; }, + 0xc0...0xdf => { // 110xxxxx + remaining_code_units = 1; + state = State.CharLiteralUnicode; + }, + 0xe0...0xef => { // 1110xxxx + remaining_code_units = 2; + state = State.CharLiteralUnicode; + }, + 0xf0...0xf7 => { // 11110xxx + remaining_code_units = 3; + state = State.CharLiteralUnicode; + }, else => { - if (c < 0x20 or c == 0x7f) { - result.id = Token.Id.Invalid; - break; - } - state = State.CharLiteralEnd; }, }, @@ -867,6 +876,19 @@ pub const Tokenizer = struct { }, }, + State.CharLiteralUnicode => switch (c) { + 0x80...0xbf => { + remaining_code_units -= 1; + if (remaining_code_units == 0) { + state = State.CharLiteralEnd; + } + }, + else => { + result.id = Token.Id.Invalid; + break; + }, + }, + State.MultilineStringLiteralLine => switch (c) { '\n' => { self.index += 1; @@ -1220,6 +1242,7 @@ pub const Tokenizer = struct { State.CharLiteralUnicodeEscape, State.CharLiteralUnicodeInvalid, State.CharLiteralEnd, + State.CharLiteralUnicode, State.StringLiteralBackslash, State.LBracketStar, State.LBracketStarC, @@ -1428,6 +1451,12 @@ test "tokenizer - char literal with unicode escapes" { , [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid }); } +test "tokenizer - char literal with unicode code point" { + testTokenize( + \\'💩' + , [_]Token.Id{.CharLiteral}); +} + test "tokenizer - float literal e exponent" { testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{ Token.Id.Identifier, |
