diff options
| author | Cody Tapscott <topolarity@tapscott.me> | 2022-03-01 20:51:01 -0700 |
|---|---|---|
| committer | Andrew Kelley <andrew@ziglang.org> | 2022-03-02 14:45:19 -0500 |
| commit | 5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8 (patch) | |
| tree | 97fc20a6da874c0d808c31e92609e95bb854c973 /lib/std | |
| parent | aa867c7dbe6576f61f957667fef769030aff7c69 (diff) | |
| download | zig-5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8.tar.gz zig-5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8.zip | |
stage2 parser: UTF-8 encode \u{NNNNNN} escape sequences
The core of this change is to re-use the escape sequence parsing logic
for parsing both string and character literals.
The actual fix is that UTF-8 encoding was missing for string literals
with \u{...} escape sequences.
Diffstat (limited to 'lib/std')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | lib/std/zig.zig | 203 |
| -rw-r--r-- | lib/std/zig/string_literal.zig | 366 |
2 files changed, 259 insertions, 310 deletions
diff --git a/lib/std/zig.zig b/lib/std/zig.zig index 9b8e2294f2..0d3c94d37b 100644 --- a/lib/std/zig.zig +++ b/lib/std/zig.zig @@ -14,6 +14,10 @@ pub const Ast = @import("zig/Ast.zig"); pub const system = @import("zig/system.zig"); pub const CrossTarget = @import("zig/CrossTarget.zig"); +// Character literal parsing +pub const ParsedCharLiteral = string_literal.ParsedCharLiteral; +pub const parseCharLiteral = string_literal.parseCharLiteral; + // Files needed by translate-c. pub const c_builtins = @import("zig/c_builtins.zig"); pub const c_translation = @import("zig/c_translation.zig"); @@ -185,205 +189,6 @@ pub fn binNameAlloc(allocator: std.mem.Allocator, options: BinNameOptions) error } } -pub const ParsedCharLiteral = union(enum) { - success: u32, - /// The character after backslash is not recognized. - invalid_escape_character: usize, - /// Expected hex digit at this index. - expected_hex_digit: usize, - /// Unicode escape sequence had no digits with rbrace at this index. - empty_unicode_escape_sequence: usize, - /// Expected hex digit or '}' at this index. - expected_hex_digit_or_rbrace: usize, - /// The unicode point is outside the range of Unicode codepoints. - unicode_escape_overflow: usize, - /// Expected '{' at this index. - expected_lbrace: usize, - /// Expected the terminating single quote at this index. - expected_end: usize, - /// The character at this index cannot be represented without an escape sequence. - invalid_character: usize, -}; - -/// Only validates escape sequence characters. -/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between. 
-pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral { - assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\''); - - switch (slice[1]) { - 0 => return .{ .invalid_character = 1 }, - '\\' => switch (slice[2]) { - 'n' => return .{ .success = '\n' }, - 'r' => return .{ .success = '\r' }, - '\\' => return .{ .success = '\\' }, - 't' => return .{ .success = '\t' }, - '\'' => return .{ .success = '\'' }, - '"' => return .{ .success = '"' }, - 'x' => { - if (slice.len < 4) { - return .{ .expected_hex_digit = 3 }; - } - var value: u32 = 0; - var i: usize = 3; - while (i < 5) : (i += 1) { - const c = slice[i]; - switch (c) { - '0'...'9' => { - value *= 16; - value += c - '0'; - }, - 'a'...'f' => { - value *= 16; - value += c - 'a' + 10; - }, - 'A'...'F' => { - value *= 16; - value += c - 'A' + 10; - }, - else => { - return .{ .expected_hex_digit = i }; - }, - } - } - if (slice[i] != '\'') { - return .{ .expected_end = i }; - } - return .{ .success = value }; - }, - 'u' => { - var i: usize = 3; - if (slice[i] != '{') { - return .{ .expected_lbrace = i }; - } - i += 1; - if (slice[i] == '}') { - return .{ .empty_unicode_escape_sequence = i }; - } - - var value: u32 = 0; - while (i < slice.len) : (i += 1) { - const c = slice[i]; - switch (c) { - '0'...'9' => { - value *= 16; - value += c - '0'; - }, - 'a'...'f' => { - value *= 16; - value += c - 'a' + 10; - }, - 'A'...'F' => { - value *= 16; - value += c - 'A' + 10; - }, - '}' => { - i += 1; - break; - }, - else => return .{ .expected_hex_digit_or_rbrace = i }, - } - if (value > 0x10ffff) { - return .{ .unicode_escape_overflow = i }; - } - } - if (slice[i] != '\'') { - return .{ .expected_end = i }; - } - return .{ .success = value }; - }, - else => return .{ .invalid_escape_character = 2 }, - }, - else => { - const codepoint = std.unicode.utf8Decode(slice[1 .. 
slice.len - 1]) catch unreachable; - return .{ .success = codepoint }; - }, - } -} - -test "parseCharLiteral" { - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 'a' }, - parseCharLiteral("'a'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 'ä' }, - parseCharLiteral("'ä'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0 }, - parseCharLiteral("'\\x00'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0x4f }, - parseCharLiteral("'\\x4f'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0x4f }, - parseCharLiteral("'\\x4F'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0x3041 }, - parseCharLiteral("'ぁ'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0 }, - parseCharLiteral("'\\u{0}'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0x3041 }, - parseCharLiteral("'\\u{3041}'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0x7f }, - parseCharLiteral("'\\u{7f}'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0x7fff }, - parseCharLiteral("'\\u{7FFF}'"), - ); - - try std.testing.expectEqual( - ParsedCharLiteral{ .expected_hex_digit = 4 }, - parseCharLiteral("'\\x0'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .expected_end = 5 }, - parseCharLiteral("'\\x000'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .invalid_escape_character = 2 }, - parseCharLiteral("'\\y'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .expected_lbrace = 3 }, - parseCharLiteral("'\\u'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .expected_lbrace = 3 }, - parseCharLiteral("'\\uFFFF'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .empty_unicode_escape_sequence = 4 }, - parseCharLiteral("'\\u{}'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .unicode_escape_overflow = 9 }, - parseCharLiteral("'\\u{FFFFFF}'"), - ); 
- try std.testing.expectEqual( - ParsedCharLiteral{ .expected_hex_digit_or_rbrace = 8 }, - parseCharLiteral("'\\u{FFFF'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .expected_end = 9 }, - parseCharLiteral("'\\u{FFFF}x'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .invalid_character = 1 }, - parseCharLiteral("'\x00'"), - ); -} - test { @import("std").testing.refAllDecls(@This()); } diff --git a/lib/std/zig/string_literal.zig b/lib/std/zig/string_literal.zig index 1eaab26e3a..07ce08f491 100644 --- a/lib/std/zig/string_literal.zig +++ b/lib/std/zig/string_literal.zig @@ -1,129 +1,268 @@ const std = @import("../std.zig"); const assert = std.debug.assert; +const utf8Decode = std.unicode.utf8Decode; +const utf8Encode = std.unicode.utf8Encode; pub const ParseError = error{ OutOfMemory, - InvalidStringLiteral, + InvalidLiteral, +}; + +pub const ParsedCharLiteral = union(enum) { + success: u21, + failure: Error, }; pub const Result = union(enum) { success, - /// Found an invalid character at this index. + failure: Error, +}; + +pub const Error = union(enum) { + /// The character after backslash is missing or not recognized. + invalid_escape_character: usize, + /// Expected hex digit at this index. + expected_hex_digit: usize, + /// Unicode escape sequence had no digits with rbrace at this index. + empty_unicode_escape_sequence: usize, + /// Expected hex digit or '}' at this index. + expected_hex_digit_or_rbrace: usize, + /// Invalid unicode codepoint at this index. + invalid_unicode_codepoint: usize, + /// Expected '{' at this index. + expected_lbrace: usize, + /// Expected '}' at this index. + expected_rbrace: usize, + /// Expected '\'' at this index. + expected_single_quote: usize, + /// The character at this index cannot be represented without an escape sequence. invalid_character: usize, - /// Expected hex digits at this index. - expected_hex_digits: usize, - /// Invalid hex digits at this index. 
- invalid_hex_escape: usize, - /// Invalid unicode escape at this index. - invalid_unicode_escape: usize, - /// The left brace at this index is missing a matching right brace. - missing_matching_rbrace: usize, - /// Expected unicode digits at this index. - expected_unicode_digits: usize, }; +/// Only validates escape sequence characters. +/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between. +pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral { + assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\''); + + switch (slice[1]) { + '\\' => { + var offset: usize = 1; + const result = parseEscapeSequence(slice, &offset); + if (result == .success and (offset + 1 != slice.len or slice[offset] != '\'')) + return .{ .failure = .{ .expected_single_quote = offset } }; + + return result; + }, + 0 => return .{ .failure = .{ .invalid_character = 1 } }, + else => { + const codepoint = utf8Decode(slice[1 .. slice.len - 1]) catch unreachable; + return .{ .success = codepoint }; + }, + } +} + +/// Parse an escape sequence from `slice[offset..]`. If parsing is successful, +/// offset is updated to reflect the characters consumed. 
+fn parseEscapeSequence(slice: []const u8, offset: *usize) ParsedCharLiteral { + assert(slice.len > offset.*); + assert(slice[offset.*] == '\\'); + + if (slice.len == offset.* + 1) + return .{ .failure = .{ .invalid_escape_character = offset.* + 1 } }; + + offset.* += 2; + switch (slice[offset.* - 1]) { + 'n' => return .{ .success = '\n' }, + 'r' => return .{ .success = '\r' }, + '\\' => return .{ .success = '\\' }, + 't' => return .{ .success = '\t' }, + '\'' => return .{ .success = '\'' }, + '"' => return .{ .success = '"' }, + 'x' => { + var value: u8 = 0; + var i: usize = offset.*; + while (i < offset.* + 2) : (i += 1) { + if (i == slice.len) return .{ .failure = .{ .expected_hex_digit = i } }; + + const c = slice[i]; + switch (c) { + '0'...'9' => { + value *= 16; + value += c - '0'; + }, + 'a'...'f' => { + value *= 16; + value += c - 'a' + 10; + }, + 'A'...'F' => { + value *= 16; + value += c - 'A' + 10; + }, + else => { + return .{ .failure = .{ .expected_hex_digit = i } }; + }, + } + } + offset.* = i; + return .{ .success = value }; + }, + 'u' => { + var i: usize = offset.*; + if (i >= slice.len or slice[i] != '{') return .{ .failure = .{ .expected_lbrace = i } }; + i += 1; + if (i >= slice.len) return .{ .failure = .{ .expected_hex_digit_or_rbrace = i } }; + if (slice[i] == '}') return .{ .failure = .{ .empty_unicode_escape_sequence = i } }; + + var value: u32 = 0; + while (i < slice.len) : (i += 1) { + const c = slice[i]; + switch (c) { + '0'...'9' => { + value *= 16; + value += c - '0'; + }, + 'a'...'f' => { + value *= 16; + value += c - 'a' + 10; + }, + 'A'...'F' => { + value *= 16; + value += c - 'A' + 10; + }, + '}' => { + i += 1; + break; + }, + else => return .{ .failure = .{ .expected_hex_digit_or_rbrace = i } }, + } + if (value > 0x10ffff) { + return .{ .failure = .{ .invalid_unicode_codepoint = i } }; + } + } else { + return .{ .failure = .{ .expected_rbrace = i } }; + } + offset.* = i; + return .{ .success = @intCast(u21, value) }; + }, + else => 
return .{ .failure = .{ .invalid_escape_character = offset.* - 1 } }, + } +} + +test "parseCharLiteral" { + try std.testing.expectEqual( + ParsedCharLiteral{ .success = 'a' }, + parseCharLiteral("'a'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .success = 'ä' }, + parseCharLiteral("'ä'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .success = 0 }, + parseCharLiteral("'\\x00'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .success = 0x4f }, + parseCharLiteral("'\\x4f'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .success = 0x4f }, + parseCharLiteral("'\\x4F'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .success = 0x3041 }, + parseCharLiteral("'ぁ'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .success = 0 }, + parseCharLiteral("'\\u{0}'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .success = 0x3041 }, + parseCharLiteral("'\\u{3041}'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .success = 0x7f }, + parseCharLiteral("'\\u{7f}'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .success = 0x7fff }, + parseCharLiteral("'\\u{7FFF}'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .failure = .{ .expected_hex_digit = 4 } }, + parseCharLiteral("'\\x0'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .failure = .{ .expected_single_quote = 5 } }, + parseCharLiteral("'\\x000'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .failure = .{ .invalid_escape_character = 2 } }, + parseCharLiteral("'\\y'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .failure = .{ .expected_lbrace = 3 } }, + parseCharLiteral("'\\u'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .failure = .{ .expected_lbrace = 3 } }, + parseCharLiteral("'\\uFFFF'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .failure = .{ .empty_unicode_escape_sequence = 4 } }, + parseCharLiteral("'\\u{}'"), + ); + try 
std.testing.expectEqual( + ParsedCharLiteral{ .failure = .{ .invalid_unicode_codepoint = 9 } }, + parseCharLiteral("'\\u{FFFFFF}'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .failure = .{ .expected_hex_digit_or_rbrace = 8 } }, + parseCharLiteral("'\\u{FFFF'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .failure = .{ .expected_single_quote = 9 } }, + parseCharLiteral("'\\u{FFFF}x'"), + ); + try std.testing.expectEqual( + ParsedCharLiteral{ .failure = .{ .invalid_character = 1 } }, + parseCharLiteral("'\x00'"), + ); +} + /// Parses `bytes` as a Zig string literal and appends the result to `buf`. /// Asserts `bytes` has '"' at beginning and end. pub fn parseAppend(buf: *std.ArrayList(u8), bytes: []const u8) error{OutOfMemory}!Result { assert(bytes.len >= 2 and bytes[0] == '"' and bytes[bytes.len - 1] == '"'); - const slice = bytes[1..]; - - const prev_len = buf.items.len; - try buf.ensureUnusedCapacity(slice.len - 1); - errdefer buf.shrinkRetainingCapacity(prev_len); - - const State = enum { - Start, - Backslash, - }; - - var state = State.Start; - var index: usize = 0; - while (true) : (index += 1) { - const b = slice[index]; - - switch (state) { - State.Start => switch (b) { - '\\' => state = State.Backslash, - '\n' => { - return Result{ .invalid_character = index }; - }, - '"' => return Result.success, - else => try buf.append(b), - }, - State.Backslash => switch (b) { - 'n' => { - try buf.append('\n'); - state = State.Start; - }, - 'r' => { - try buf.append('\r'); - state = State.Start; - }, - '\\' => { - try buf.append('\\'); - state = State.Start; - }, - 't' => { - try buf.append('\t'); - state = State.Start; - }, - '\'' => { - try buf.append('\''); - state = State.Start; - }, - '"' => { - try buf.append('"'); - state = State.Start; - }, - 'x' => { - // TODO: add more/better/broader tests for this. 
- const index_continue = index + 3; - if (slice.len < index_continue) { - return Result{ .expected_hex_digits = index }; - } - if (std.fmt.parseUnsigned(u8, slice[index + 1 .. index_continue], 16)) |byte| { - try buf.append(byte); - state = State.Start; - index = index_continue - 1; // loop-header increments again - } else |err| switch (err) { - error.Overflow => unreachable, // 2 digits base 16 fits in a u8. - error.InvalidCharacter => { - return Result{ .invalid_hex_escape = index + 1 }; - }, - } - }, - 'u' => { - // TODO: add more/better/broader tests for this. - // TODO: we are already inside a nice, clean state machine... use it - // instead of this hacky code. - if (slice.len > index + 2 and slice[index + 1] == '{') { - if (std.mem.indexOfScalarPos(u8, slice[0..std.math.min(index + 9, slice.len)], index + 3, '}')) |index_end| { - const hex_str = slice[index + 2 .. index_end]; - if (std.fmt.parseUnsigned(u32, hex_str, 16)) |uint| { - if (uint <= 0x10ffff) { - // TODO this incorrectly depends on endianness - try buf.appendSlice(std.mem.toBytes(uint)[0..]); - state = State.Start; - index = index_end; // loop-header increments - continue; - } - } else |err| switch (err) { - error.Overflow => unreachable, - error.InvalidCharacter => { - return Result{ .invalid_unicode_escape = index + 1 }; - }, - } + try buf.ensureUnusedCapacity(bytes.len - 2); + + var index: usize = 1; + while (true) { + const b = bytes[index]; + + switch (b) { + '\\' => { + const escape_char_index = index + 1; + const result = parseEscapeSequence(bytes, &index); + switch (result) { + .success => |codepoint| { + if (bytes[escape_char_index] == 'u') { + buf.items.len += utf8Encode(codepoint, buf.unusedCapacitySlice()) catch { + return Result{ .failure = .{ .invalid_unicode_codepoint = escape_char_index + 1 } }; + }; } else { - return Result{ .missing_matching_rbrace = index + 1 }; + buf.appendAssumeCapacity(@intCast(u8, codepoint)); } - } else { - return Result{ .expected_unicode_digits = index }; 
- } - }, - else => { - return Result{ .invalid_character = index }; - }, + }, + .failure => |err| return Result{ .failure = err }, + } + }, + '\n' => return Result{ .failure = .{ .invalid_character = index } }, + '"' => return Result.success, + else => { + try buf.append(b); + index += 1; }, } } else unreachable; // TODO should not need else unreachable on while(true) @@ -137,18 +276,23 @@ pub fn parseAlloc(allocator: std.mem.Allocator, bytes: []const u8) ParseError![] switch (try parseAppend(&buf, bytes)) { .success => return buf.toOwnedSlice(), - else => return error.InvalidStringLiteral, + .failure => return error.InvalidLiteral, } } test "parse" { const expect = std.testing.expect; + const expectError = std.testing.expectError; const eql = std.mem.eql; - var fixed_buf_mem: [32]u8 = undefined; - var fixed_buf_alloc = std.heap.FixedBufferAllocator.init(fixed_buf_mem[0..]); + var fixed_buf_mem: [64]u8 = undefined; + var fixed_buf_alloc = std.heap.FixedBufferAllocator.init(&fixed_buf_mem); var alloc = fixed_buf_alloc.allocator(); + try expectError(error.InvalidLiteral, parseAlloc(alloc, "\"\\x6\"")); + try expect(eql(u8, "foo\nbar", try parseAlloc(alloc, "\"foo\\nbar\""))); + try expect(eql(u8, "\x12foo", try parseAlloc(alloc, "\"\\x12foo\""))); + try expect(eql(u8, "bytes\u{1234}foo", try parseAlloc(alloc, "\"bytes\\u{1234}foo\""))); try expect(eql(u8, "foo", try parseAlloc(alloc, "\"foo\""))); try expect(eql(u8, "foo", try parseAlloc(alloc, "\"f\x6f\x6f\""))); try expect(eql(u8, "f💯", try parseAlloc(alloc, "\"f\u{1f4af}\""))); |
