diff options
| author | Nick Erdmann <n@nirf.de> | 2019-10-06 19:52:35 +0200 |
|---|---|---|
| committer | Nick Erdmann <n@nirf.de> | 2019-10-07 08:18:16 +0200 |
| commit | ae7392e504e7765b05d98636cc249cbf92233f5c (patch) | |
| tree | 42fa9e83943c41a4e84e0152b1deb427dfbdc292 /src/tokenizer.cpp | |
| parent | 571123465b2e030b7b9cf42732ed30f77192fbcd (diff) | |
| download | zig-ae7392e504e7765b05d98636cc249cbf92233f5c.tar.gz zig-ae7392e504e7765b05d98636cc249cbf92233f5c.zip | |
unicode character literals
Diffstat (limited to 'src/tokenizer.cpp')
| -rw-r--r-- | src/tokenizer.cpp | 51 |
1 file changed, 40 insertions, 11 deletions
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 71a24fe726..11824bd871 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -193,6 +193,7 @@ enum TokenizeState { TokenizeStateStringEscapeUnicodeStart, TokenizeStateCharLiteral, TokenizeStateCharLiteralEnd, + TokenizeStateCharLiteralUnicode, TokenizeStateSawStar, TokenizeStateSawStarPercent, TokenizeStateSawSlash, @@ -247,6 +248,7 @@ struct Tokenize { int exponent_in_bin_or_dec; BigInt specified_exponent; BigInt significand; + size_t remaining_code_units; }; ATTRIBUTE_PRINTF(2, 3) @@ -1176,17 +1178,32 @@ void tokenize(Buf *buf, Tokenization *out) { } break; case TokenizeStateCharLiteral: - switch (c) { - case '\'': - tokenize_error(&t, "expected character"); - break; - case '\\': - t.state = TokenizeStateStringEscape; - break; - default: - t.cur_tok->data.char_lit.c = c; - t.state = TokenizeStateCharLiteralEnd; - break; + if (c == '\'') { + tokenize_error(&t, "expected character"); + } else if (c == '\\') { + t.state = TokenizeStateStringEscape; + } else if ((c >= 0x80 && c <= 0xbf) || c >= 0xf8) { + // 10xxxxxx + // 11111xxx + invalid_char_error(&t, c); + } else if (c >= 0xc0 && c <= 0xdf) { + // 110xxxxx + t.cur_tok->data.char_lit.c = c & 0x1f; + t.remaining_code_units = 1; + t.state = TokenizeStateCharLiteralUnicode; + } else if (c >= 0xe0 && c <= 0xef) { + // 1110xxxx + t.cur_tok->data.char_lit.c = c & 0x0f; + t.remaining_code_units = 2; + t.state = TokenizeStateCharLiteralUnicode; + } else if (c >= 0xf0 && c <= 0xf7) { + // 11110xxx + t.cur_tok->data.char_lit.c = c & 0x07; + t.remaining_code_units = 3; + t.state = TokenizeStateCharLiteralUnicode; + } else { + t.cur_tok->data.char_lit.c = c; + t.state = TokenizeStateCharLiteralEnd; } break; case TokenizeStateCharLiteralEnd: @@ -1199,6 +1216,17 @@ void tokenize(Buf *buf, Tokenization *out) { invalid_char_error(&t, c); } break; + case TokenizeStateCharLiteralUnicode: + if (c <= 0x7f || c >= 0xc0) { + invalid_char_error(&t, c); + } + 
t.cur_tok->data.char_lit.c <<= 6; + t.cur_tok->data.char_lit.c += c & 0x3f; + t.remaining_code_units--; + if (t.remaining_code_units == 0) { + t.state = TokenizeStateCharLiteralEnd; + } + break; case TokenizeStateZero: switch (c) { case 'b': @@ -1434,6 +1462,7 @@ void tokenize(Buf *buf, Tokenization *out) { break; case TokenizeStateCharLiteral: case TokenizeStateCharLiteralEnd: + case TokenizeStateCharLiteralUnicode: tokenize_error(&t, "unterminated character literal"); break; case TokenizeStateSymbol: |
