diff options
| -rw-r--r-- | doc/langref.html.in | 4 | ||||
| -rw-r--r-- | lib/std/zig/tokenizer.zig | 41 | ||||
| -rw-r--r-- | src/tokenizer.cpp | 51 | ||||
| -rw-r--r-- | test/stage1/behavior/misc.zig | 4 |
4 files changed, 81 insertions, 19 deletions
diff --git a/doc/langref.html.in b/doc/langref.html.in index 2a323d5539..a30670b0c7 100644 --- a/doc/langref.html.in +++ b/doc/langref.html.in @@ -552,8 +552,7 @@ pub fn main() void { <p> Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as {#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals - and character literals. Once https://github.com/ziglang/zig/issues/2097 is implemented, - character literals will be allowed to have a single UTF-8 encoded codepoint. + and character literals. </p> {#code_begin|test#} const assert = @import("std").debug.assert; @@ -567,6 +566,7 @@ test "string literals" { assert(normal_bytes[1] == 'e'); assert('e' == '\x65'); assert('\u{1f4a9}' == 128169); + assert('💯' == 128175); assert(mem.eql(u8, "hello", "h\x65llo")); // A C string literal is a null terminated pointer. diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig index b0a6cd1122..872893a607 100644 --- a/lib/std/zig/tokenizer.zig +++ b/lib/std/zig/tokenizer.zig @@ -371,6 +371,7 @@ pub const Tokenizer = struct { CharLiteralUnicodeEscapeSawU, CharLiteralUnicodeEscape, CharLiteralUnicodeInvalid, + CharLiteralUnicode, CharLiteralEnd, Backslash, Equal, @@ -427,6 +428,7 @@ pub const Tokenizer = struct { .end = undefined, }; var seen_escape_digits: usize = undefined; + var remaining_code_units: usize = undefined; while (self.index < self.buffer.len) : (self.index += 1) { const c = self.buffer[self.index]; switch (state) { @@ -774,16 +776,23 @@ pub const Tokenizer = struct { '\\' => { state = State.CharLiteralBackslash; }, - '\'' => { + '\'', 0x80...0xbf, 0xf8...0xff => { result.id = Token.Id.Invalid; break; }, + 0xc0...0xdf => { // 110xxxxx + remaining_code_units = 1; + state = State.CharLiteralUnicode; + }, + 0xe0...0xef => { // 1110xxxx + remaining_code_units = 2; + state = State.CharLiteralUnicode; + }, + 0xf0...0xf7 => { // 11110xxx + remaining_code_units = 3; + state = State.CharLiteralUnicode; + }, else => { - if (c < 0x20 or c == 0x7f) { - result.id = Token.Id.Invalid; - break; - } - state = State.CharLiteralEnd; }, }, @@ -867,6 +876,19 @@ pub const Tokenizer = struct { }, }, + State.CharLiteralUnicode => switch (c) { + 0x80...0xbf => { + remaining_code_units -= 1; + if (remaining_code_units == 0) { + state = State.CharLiteralEnd; + } + }, + else => { + result.id = Token.Id.Invalid; + break; + }, + }, + State.MultilineStringLiteralLine => switch (c) { '\n' => { self.index += 1; @@ -1220,6 +1242,7 @@ pub const Tokenizer = struct { State.CharLiteralUnicodeEscape, State.CharLiteralUnicodeInvalid, State.CharLiteralEnd, + State.CharLiteralUnicode, State.StringLiteralBackslash, State.LBracketStar, State.LBracketStarC, @@ -1428,6 +1451,12 @@ test "tokenizer - char literal with unicode escapes" { , [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid }); } +test "tokenizer - char literal with unicode code point" { + testTokenize( + \\'💩' + , [_]Token.Id{.CharLiteral}); +} + test "tokenizer - float literal e exponent" { testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{ Token.Id.Identifier, diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 399597b7bc..475c284d27 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -193,6 +193,7 @@ enum TokenizeState { TokenizeStateStringEscapeUnicodeStart, TokenizeStateCharLiteral, TokenizeStateCharLiteralEnd, + TokenizeStateCharLiteralUnicode, TokenizeStateSawStar, TokenizeStateSawStarPercent, TokenizeStateSawSlash, @@ -250,6 +251,7 @@ struct Tokenize { int exponent_in_bin_or_dec; BigInt specified_exponent; BigInt significand; + size_t remaining_code_units; }; ATTRIBUTE_PRINTF(2, 3) @@ -1221,17 +1223,32 @@ void tokenize(Buf *buf, Tokenization *out) { } break; case TokenizeStateCharLiteral: - switch (c) { - case '\'': - tokenize_error(&t, "expected character"); - break; - case '\\': - t.state = TokenizeStateStringEscape; - break; - default: - t.cur_tok->data.char_lit.c = c; - t.state = TokenizeStateCharLiteralEnd; - break; + if (c == '\'') { + tokenize_error(&t, "expected character"); + } else if (c == '\\') { + t.state = TokenizeStateStringEscape; + } else if ((c >= 0x80 && c <= 0xbf) || c >= 0xf8) { + // 10xxxxxx + // 11111xxx + invalid_char_error(&t, c); + } else if (c >= 0xc0 && c <= 0xdf) { + // 110xxxxx + t.cur_tok->data.char_lit.c = c & 0x1f; + t.remaining_code_units = 1; + t.state = TokenizeStateCharLiteralUnicode; + } else if (c >= 0xe0 && c <= 0xef) { + // 1110xxxx + t.cur_tok->data.char_lit.c = c & 0x0f; + t.remaining_code_units = 2; + t.state = TokenizeStateCharLiteralUnicode; + } else if (c >= 0xf0 && c <= 0xf7) { + // 11110xxx + t.cur_tok->data.char_lit.c = c & 0x07; + t.remaining_code_units = 3; + t.state = TokenizeStateCharLiteralUnicode; + } else { + t.cur_tok->data.char_lit.c = c; + t.state = TokenizeStateCharLiteralEnd; } break; case TokenizeStateCharLiteralEnd: @@ -1244,6 +1261,17 @@ void tokenize(Buf *buf, Tokenization *out) { invalid_char_error(&t, c); } break; + case TokenizeStateCharLiteralUnicode: + if (c <= 0x7f || c >= 0xc0) { + invalid_char_error(&t, c); + } + t.cur_tok->data.char_lit.c <<= 6; + t.cur_tok->data.char_lit.c += c & 0x3f; + t.remaining_code_units--; + if (t.remaining_code_units == 0) { + t.state = TokenizeStateCharLiteralEnd; + } + break; case TokenizeStateZero: switch (c) { case 'b': @@ -1479,6 +1507,7 @@ void tokenize(Buf *buf, Tokenization *out) { break; case TokenizeStateCharLiteral: case TokenizeStateCharLiteralEnd: + case TokenizeStateCharLiteralUnicode: tokenize_error(&t, "unterminated character literal"); break; case TokenizeStateSymbol: diff --git a/test/stage1/behavior/misc.zig b/test/stage1/behavior/misc.zig index ab16b08be8..65ac83bcf0 100644 --- a/test/stage1/behavior/misc.zig +++ b/test/stage1/behavior/misc.zig @@ -699,6 +699,10 @@ test "unicode escape in character literal" { expect(a == 128169); } +test "unicode character in character literal" { + expect('💩' == 128169); +} + test "result location zero sized array inside struct field implicit cast to slice" { const E = struct { entries: []u32, |
