stage2 parser: UTF-8 encode \u{NNNNNN} escape sequences

The core of this change is to reuse the same escape-sequence parsing logic for both string literals and character literals. The actual bug being fixed is that string literals containing \u{...} escape sequences were not being UTF-8 encoded.
author: Cody Tapscott <topolarity@tapscott.me> 2022-03-01 20:51:01 -0700
committer: Andrew Kelley <andrew@ziglang.org> 2022-03-02 14:45:19 -0500
commit: 5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8 (patch)
tree: 97fc20a6da874c0d808c31e92609e95bb854c973 /lib/std/zig.zig
parent: aa867c7dbe6576f61f957667fef769030aff7c69 (diff)
download: zig-5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8.tar.gz zig-5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8.zip
1 file changed, 4 insertions, 199 deletions
diff --git a/lib/std/zig.zig b/lib/std/zig.zig
index 9b8e2294f2..0d3c94d37b 100644
--- a/lib/std/zig.zig
+++ b/lib/std/zig.zig
@@ -14,6 +14,10 @@ pub const Ast = @import("zig/Ast.zig");
 pub const system = @import("zig/system.zig");
 pub const CrossTarget = @import("zig/CrossTarget.zig");
 
+// Character literal parsing
+pub const ParsedCharLiteral = string_literal.ParsedCharLiteral;
+pub const parseCharLiteral = string_literal.parseCharLiteral;
+
 // Files needed by translate-c.
 pub const c_builtins = @import("zig/c_builtins.zig");
 pub const c_translation = @import("zig/c_translation.zig");
@@ -185,205 +189,6 @@ pub fn binNameAlloc(allocator: std.mem.Allocator, options: BinNameOptions) error
     }
 }
 
-pub const ParsedCharLiteral = union(enum) {
-    success: u32,
-    /// The character after backslash is not recognized.
-    invalid_escape_character: usize,
-    /// Expected hex digit at this index.
-    expected_hex_digit: usize,
-    /// Unicode escape sequence had no digits with rbrace at this index.
-    empty_unicode_escape_sequence: usize,
-    /// Expected hex digit or '}' at this index.
-    expected_hex_digit_or_rbrace: usize,
-    /// The unicode point is outside the range of Unicode codepoints.
-    unicode_escape_overflow: usize,
-    /// Expected '{' at this index.
-    expected_lbrace: usize,
-    /// Expected the terminating single quote at this index.
-    expected_end: usize,
-    /// The character at this index cannot be represented without an escape sequence.
-    invalid_character: usize,
-};
-
-/// Only validates escape sequence characters.
-/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between.
-pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
-    assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\'');
-
-    switch (slice[1]) {
-        0 => return .{ .invalid_character = 1 },
-        '\\' => switch (slice[2]) {
-            'n' => return .{ .success = '\n' },
-            'r' => return .{ .success = '\r' },
-            '\\' => return .{ .success = '\\' },
-            't' => return .{ .success = '\t' },
-            '\'' => return .{ .success = '\'' },
-            '"' => return .{ .success = '"' },
-            'x' => {
-                if (slice.len < 4) {
-                    return .{ .expected_hex_digit = 3 };
-                }
-                var value: u32 = 0;
-                var i: usize = 3;
-                while (i < 5) : (i += 1) {
-                    const c = slice[i];
-                    switch (c) {
-                        '0'...'9' => {
-                            value *= 16;
-                            value += c - '0';
-                        },
-                        'a'...'f' => {
-                            value *= 16;
-                            value += c - 'a' + 10;
-                        },
-                        'A'...'F' => {
-                            value *= 16;
-                            value += c - 'A' + 10;
-                        },
-                        else => {
-                            return .{ .expected_hex_digit = i };
-                        },
-                    }
-                }
-                if (slice[i] != '\'') {
-                    return .{ .expected_end = i };
-                }
-                return .{ .success = value };
-            },
-            'u' => {
-                var i: usize = 3;
-                if (slice[i] != '{') {
-                    return .{ .expected_lbrace = i };
-                }
-                i += 1;
-                if (slice[i] == '}') {
-                    return .{ .empty_unicode_escape_sequence = i };
-                }
-
-                var value: u32 = 0;
-                while (i < slice.len) : (i += 1) {
-                    const c = slice[i];
-                    switch (c) {
-                        '0'...'9' => {
-                            value *= 16;
-                            value += c - '0';
-                        },
-                        'a'...'f' => {
-                            value *= 16;
-                            value += c - 'a' + 10;
-                        },
-                        'A'...'F' => {
-                            value *= 16;
-                            value += c - 'A' + 10;
-                        },
-                        '}' => {
-                            i += 1;
-                            break;
-                        },
-                        else => return .{ .expected_hex_digit_or_rbrace = i },
-                    }
-                    if (value > 0x10ffff) {
-                        return .{ .unicode_escape_overflow = i };
-                    }
-                }
-                if (slice[i] != '\'') {
-                    return .{ .expected_end = i };
-                }
-                return .{ .success = value };
-            },
-            else => return .{ .invalid_escape_character = 2 },
-        },
-        else => {
-            const codepoint = std.unicode.utf8Decode(slice[1 .. slice.len - 1]) catch unreachable;
-            return .{ .success = codepoint };
-        },
-    }
-}
-
-test "parseCharLiteral" {
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .success = 'a' },
-        parseCharLiteral("'a'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .success = 'ä' },
-        parseCharLiteral("'ä'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .success = 0 },
-        parseCharLiteral("'\\x00'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .success = 0x4f },
-        parseCharLiteral("'\\x4f'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .success = 0x4f },
-        parseCharLiteral("'\\x4F'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .success = 0x3041 },
-        parseCharLiteral("'ぁ'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .success = 0 },
-        parseCharLiteral("'\\u{0}'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .success = 0x3041 },
-        parseCharLiteral("'\\u{3041}'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .success = 0x7f },
-        parseCharLiteral("'\\u{7f}'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .success = 0x7fff },
-        parseCharLiteral("'\\u{7FFF}'"),
-    );
-
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .expected_hex_digit = 4 },
-        parseCharLiteral("'\\x0'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .expected_end = 5 },
-        parseCharLiteral("'\\x000'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .invalid_escape_character = 2 },
-        parseCharLiteral("'\\y'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .expected_lbrace = 3 },
-        parseCharLiteral("'\\u'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .expected_lbrace = 3 },
-        parseCharLiteral("'\\uFFFF'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .empty_unicode_escape_sequence = 4 },
-        parseCharLiteral("'\\u{}'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .unicode_escape_overflow = 9 },
-        parseCharLiteral("'\\u{FFFFFF}'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .expected_hex_digit_or_rbrace = 8 },
-        parseCharLiteral("'\\u{FFFF'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .expected_end = 9 },
-        parseCharLiteral("'\\u{FFFF}x'"),
-    );
-    try std.testing.expectEqual(
-        ParsedCharLiteral{ .invalid_character = 1 },
-        parseCharLiteral("'\x00'"),
-    );
-}
-
 test {
     @import("std").testing.refAllDecls(@This());
 }
author	Cody Tapscott <topolarity@tapscott.me>	2022-03-01 20:51:01 -0700
committer	Andrew Kelley <andrew@ziglang.org>	2022-03-02 14:45:19 -0500
commit	5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8 (patch)
tree	97fc20a6da874c0d808c31e92609e95bb854c973 /lib/std/zig.zig
parent	aa867c7dbe6576f61f957667fef769030aff7c69 (diff)
download	zig-5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8.tar.gz zig-5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8.zip