diff options
| author | Cody Tapscott <topolarity@tapscott.me> | 2022-03-01 20:51:01 -0700 |
|---|---|---|
| committer | Andrew Kelley <andrew@ziglang.org> | 2022-03-02 14:45:19 -0500 |
| commit | 5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8 (patch) | |
| tree | 97fc20a6da874c0d808c31e92609e95bb854c973 /lib/std/zig.zig | |
| parent | aa867c7dbe6576f61f957667fef769030aff7c69 (diff) | |
| download | zig-5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8.tar.gz zig-5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8.zip | |
stage2 parser: UTF-8 encode \u{NNNNNN} escape sequences
The core of this change is to re-use the escape sequence parsing logic
for parsing both string and character literals.
The actual bug being fixed is that string literals containing \u{...}
escape sequences were not UTF-8 encoded.
Diffstat (limited to 'lib/std/zig.zig')
| -rw-r--r-- | lib/std/zig.zig | 203 |
1 file changed, 4 insertions, 199 deletions
diff --git a/lib/std/zig.zig b/lib/std/zig.zig index 9b8e2294f2..0d3c94d37b 100644 --- a/lib/std/zig.zig +++ b/lib/std/zig.zig @@ -14,6 +14,10 @@ pub const Ast = @import("zig/Ast.zig"); pub const system = @import("zig/system.zig"); pub const CrossTarget = @import("zig/CrossTarget.zig"); +// Character literal parsing +pub const ParsedCharLiteral = string_literal.ParsedCharLiteral; +pub const parseCharLiteral = string_literal.parseCharLiteral; + // Files needed by translate-c. pub const c_builtins = @import("zig/c_builtins.zig"); pub const c_translation = @import("zig/c_translation.zig"); @@ -185,205 +189,6 @@ pub fn binNameAlloc(allocator: std.mem.Allocator, options: BinNameOptions) error } } -pub const ParsedCharLiteral = union(enum) { - success: u32, - /// The character after backslash is not recognized. - invalid_escape_character: usize, - /// Expected hex digit at this index. - expected_hex_digit: usize, - /// Unicode escape sequence had no digits with rbrace at this index. - empty_unicode_escape_sequence: usize, - /// Expected hex digit or '}' at this index. - expected_hex_digit_or_rbrace: usize, - /// The unicode point is outside the range of Unicode codepoints. - unicode_escape_overflow: usize, - /// Expected '{' at this index. - expected_lbrace: usize, - /// Expected the terminating single quote at this index. - expected_end: usize, - /// The character at this index cannot be represented without an escape sequence. - invalid_character: usize, -}; - -/// Only validates escape sequence characters. -/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between. 
-pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral { - assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\''); - - switch (slice[1]) { - 0 => return .{ .invalid_character = 1 }, - '\\' => switch (slice[2]) { - 'n' => return .{ .success = '\n' }, - 'r' => return .{ .success = '\r' }, - '\\' => return .{ .success = '\\' }, - 't' => return .{ .success = '\t' }, - '\'' => return .{ .success = '\'' }, - '"' => return .{ .success = '"' }, - 'x' => { - if (slice.len < 4) { - return .{ .expected_hex_digit = 3 }; - } - var value: u32 = 0; - var i: usize = 3; - while (i < 5) : (i += 1) { - const c = slice[i]; - switch (c) { - '0'...'9' => { - value *= 16; - value += c - '0'; - }, - 'a'...'f' => { - value *= 16; - value += c - 'a' + 10; - }, - 'A'...'F' => { - value *= 16; - value += c - 'A' + 10; - }, - else => { - return .{ .expected_hex_digit = i }; - }, - } - } - if (slice[i] != '\'') { - return .{ .expected_end = i }; - } - return .{ .success = value }; - }, - 'u' => { - var i: usize = 3; - if (slice[i] != '{') { - return .{ .expected_lbrace = i }; - } - i += 1; - if (slice[i] == '}') { - return .{ .empty_unicode_escape_sequence = i }; - } - - var value: u32 = 0; - while (i < slice.len) : (i += 1) { - const c = slice[i]; - switch (c) { - '0'...'9' => { - value *= 16; - value += c - '0'; - }, - 'a'...'f' => { - value *= 16; - value += c - 'a' + 10; - }, - 'A'...'F' => { - value *= 16; - value += c - 'A' + 10; - }, - '}' => { - i += 1; - break; - }, - else => return .{ .expected_hex_digit_or_rbrace = i }, - } - if (value > 0x10ffff) { - return .{ .unicode_escape_overflow = i }; - } - } - if (slice[i] != '\'') { - return .{ .expected_end = i }; - } - return .{ .success = value }; - }, - else => return .{ .invalid_escape_character = 2 }, - }, - else => { - const codepoint = std.unicode.utf8Decode(slice[1 .. 
slice.len - 1]) catch unreachable; - return .{ .success = codepoint }; - }, - } -} - -test "parseCharLiteral" { - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 'a' }, - parseCharLiteral("'a'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 'ä' }, - parseCharLiteral("'ä'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0 }, - parseCharLiteral("'\\x00'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0x4f }, - parseCharLiteral("'\\x4f'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0x4f }, - parseCharLiteral("'\\x4F'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0x3041 }, - parseCharLiteral("'ぁ'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0 }, - parseCharLiteral("'\\u{0}'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0x3041 }, - parseCharLiteral("'\\u{3041}'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0x7f }, - parseCharLiteral("'\\u{7f}'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .success = 0x7fff }, - parseCharLiteral("'\\u{7FFF}'"), - ); - - try std.testing.expectEqual( - ParsedCharLiteral{ .expected_hex_digit = 4 }, - parseCharLiteral("'\\x0'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .expected_end = 5 }, - parseCharLiteral("'\\x000'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .invalid_escape_character = 2 }, - parseCharLiteral("'\\y'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .expected_lbrace = 3 }, - parseCharLiteral("'\\u'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .expected_lbrace = 3 }, - parseCharLiteral("'\\uFFFF'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .empty_unicode_escape_sequence = 4 }, - parseCharLiteral("'\\u{}'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .unicode_escape_overflow = 9 }, - parseCharLiteral("'\\u{FFFFFF}'"), - ); 
- try std.testing.expectEqual( - ParsedCharLiteral{ .expected_hex_digit_or_rbrace = 8 }, - parseCharLiteral("'\\u{FFFF'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .expected_end = 9 }, - parseCharLiteral("'\\u{FFFF}x'"), - ); - try std.testing.expectEqual( - ParsedCharLiteral{ .invalid_character = 1 }, - parseCharLiteral("'\x00'"), - ); -} - test { @import("std").testing.refAllDecls(@This()); } |
