about summary refs log tree commit diff
path: root/src/tokenizer.cpp
diff options
context:
space:
mode:
author Nick Erdmann <n@nirf.de> 2019-10-06 19:52:35 +0200
committer Nick Erdmann <n@nirf.de> 2019-10-07 08:18:16 +0200
commit ae7392e504e7765b05d98636cc249cbf92233f5c (patch)
tree 42fa9e83943c41a4e84e0152b1deb427dfbdc292 /src/tokenizer.cpp
parent 571123465b2e030b7b9cf42732ed30f77192fbcd (diff)
download zig-ae7392e504e7765b05d98636cc249cbf92233f5c.tar.gz
zig-ae7392e504e7765b05d98636cc249cbf92233f5c.zip
unicode character literals
Diffstat (limited to 'src/tokenizer.cpp')
-rw-r--r-- src/tokenizer.cpp 51
1 files changed, 40 insertions, 11 deletions
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 71a24fe726..11824bd871 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -193,6 +193,7 @@ enum TokenizeState {
TokenizeStateStringEscapeUnicodeStart,
TokenizeStateCharLiteral,
TokenizeStateCharLiteralEnd,
+ TokenizeStateCharLiteralUnicode,
TokenizeStateSawStar,
TokenizeStateSawStarPercent,
TokenizeStateSawSlash,
@@ -247,6 +248,7 @@ struct Tokenize {
int exponent_in_bin_or_dec;
BigInt specified_exponent;
BigInt significand;
+ size_t remaining_code_units;
};
ATTRIBUTE_PRINTF(2, 3)
@@ -1176,17 +1178,32 @@ void tokenize(Buf *buf, Tokenization *out) {
}
break;
case TokenizeStateCharLiteral:
- switch (c) {
- case '\'':
- tokenize_error(&t, "expected character");
- break;
- case '\\':
- t.state = TokenizeStateStringEscape;
- break;
- default:
- t.cur_tok->data.char_lit.c = c;
- t.state = TokenizeStateCharLiteralEnd;
- break;
+ if (c == '\'') {
+ tokenize_error(&t, "expected character");
+ } else if (c == '\\') {
+ t.state = TokenizeStateStringEscape;
+ } else if ((c >= 0x80 && c <= 0xbf) || c >= 0xf8) {
+ // 10xxxxxx
+ // 11111xxx
+ invalid_char_error(&t, c);
+ } else if (c >= 0xc0 && c <= 0xdf) {
+ // 110xxxxx
+ t.cur_tok->data.char_lit.c = c & 0x1f;
+ t.remaining_code_units = 1;
+ t.state = TokenizeStateCharLiteralUnicode;
+ } else if (c >= 0xe0 && c <= 0xef) {
+ // 1110xxxx
+ t.cur_tok->data.char_lit.c = c & 0x0f;
+ t.remaining_code_units = 2;
+ t.state = TokenizeStateCharLiteralUnicode;
+ } else if (c >= 0xf0 && c <= 0xf7) {
+ // 11110xxx
+ t.cur_tok->data.char_lit.c = c & 0x07;
+ t.remaining_code_units = 3;
+ t.state = TokenizeStateCharLiteralUnicode;
+ } else {
+ t.cur_tok->data.char_lit.c = c;
+ t.state = TokenizeStateCharLiteralEnd;
}
break;
case TokenizeStateCharLiteralEnd:
@@ -1199,6 +1216,17 @@ void tokenize(Buf *buf, Tokenization *out) {
invalid_char_error(&t, c);
}
break;
+ case TokenizeStateCharLiteralUnicode:
+ if (c <= 0x7f || c >= 0xc0) {
+ invalid_char_error(&t, c);
+ }
+ t.cur_tok->data.char_lit.c <<= 6;
+ t.cur_tok->data.char_lit.c += c & 0x3f;
+ t.remaining_code_units--;
+ if (t.remaining_code_units == 0) {
+ t.state = TokenizeStateCharLiteralEnd;
+ }
+ break;
case TokenizeStateZero:
switch (c) {
case 'b':
@@ -1434,6 +1462,7 @@ void tokenize(Buf *buf, Tokenization *out) {
break;
case TokenizeStateCharLiteral:
case TokenizeStateCharLiteralEnd:
+ case TokenizeStateCharLiteralUnicode:
tokenize_error(&t, "unterminated character literal");
break;
case TokenizeStateSymbol: