diff options
Diffstat (limited to 'src/parser.cpp')
| -rw-r--r-- | src/parser.cpp | 76 |
1 files changed, 69 insertions, 7 deletions
diff --git a/src/parser.cpp b/src/parser.cpp index 62f83fd080..04434f82e6 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -219,7 +219,7 @@ static uint8_t parse_char_literal(ParseContext *pc, Token *token) { return return_value; } -static int get_hex_digit(uint8_t c) { +static uint32_t get_hex_digit(uint8_t c) { switch (c) { case '0': return 0; case '1': return 1; @@ -251,7 +251,7 @@ static int get_hex_digit(uint8_t c) { case 'F': return 15; default: - return -1; + return UINT32_MAX; } } @@ -279,13 +279,17 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool StateEscape, StateHex1, StateHex2, + StateUnicode, }; buf_resize(buf, 0); + int unicode_index; + int unicode_end; + State state = StatePre; SrcPos pos = {token->start_line, token->start_column}; - int hex_value = 0; + uint32_t hex_value = 0; for (int i = token->start_pos; i < token->end_pos - 1; i += 1) { uint8_t c = *((uint8_t*)buf_ptr(pc->buf) + i); @@ -348,17 +352,34 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool if (offset_map) offset_map->append(pos); state = StateStart; break; + case '\'': + buf_append_char(buf, '\''); + if (offset_map) offset_map->append(pos); + state = StateStart; + break; case 'x': state = StateHex1; break; + case 'u': + state = StateUnicode; + unicode_index = 0; + unicode_end = 4; + hex_value = 0; + break; + case 'U': + state = StateUnicode; + unicode_index = 0; + unicode_end = 6; + hex_value = 0; + break; default: ast_error(pc, token, "invalid escape character"); } break; case StateHex1: { - int hex_digit = get_hex_digit(c); - if (hex_digit == -1) { + uint32_t hex_digit = get_hex_digit(c); + if (hex_digit == UINT32_MAX) { ast_error(pc, token, "invalid hex digit: '%c'", c); } hex_value = hex_digit * 16; @@ -367,8 +388,8 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool } case StateHex2: { - int hex_digit = get_hex_digit(c); - if (hex_digit == -1) { + uint32_t hex_digit = get_hex_digit(c); + if (hex_digit == UINT32_MAX) { ast_error(pc, token, "invalid hex digit: '%c'", c); } hex_value += hex_digit; @@ -377,6 +398,47 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool state = StateStart; break; } + case StateUnicode: + { + uint32_t hex_digit = get_hex_digit(c); + if (hex_digit == UINT32_MAX) { + ast_error(pc, token, "invalid hex digit: '%c'", c); + } + hex_value *= 16; + hex_value += hex_digit; + unicode_index += 1; + if (unicode_index >= unicode_end) { + if (hex_value <= 0x7f) { + // 00000000 00000000 00000000 0xxxxxxx + buf_append_char(buf, hex_value); + } else if (hex_value <= 0x7ff) { + // 00000000 00000000 00000xxx xx000000 + buf_append_char(buf, (unsigned char)(0xc0 | (hex_value >> 6))); + // 00000000 00000000 00000000 00xxxxxx + buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f))); + } else if (hex_value <= 0xffff) { + // 00000000 00000000 xxxx0000 00000000 + buf_append_char(buf, (unsigned char)(0xe0 | (hex_value >> 12))); + // 00000000 00000000 0000xxxx xx000000 + buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f))); + // 00000000 00000000 00000000 00xxxxxx + buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f))); + } else if (hex_value <= 0x10ffff) { + // 00000000 000xxx00 00000000 00000000 + buf_append_char(buf, (unsigned char)(0xf0 | (hex_value >> 18))); + // 00000000 000000xx xxxx0000 00000000 + buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 12) & 0x3f))); + // 00000000 00000000 0000xxxx xx000000 + buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f))); + // 00000000 00000000 00000000 00xxxxxx + buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f))); + } else { + ast_error(pc, token, "unicode value out of range: %x", hex_value); + } + state = StateStart; + } + break; + } } if (c == '\n') { pos.line += 1; |
