about | summary | refs | log | tree | commit | diff
path: root/lib/std/zig.zig
diff options
context:
space:
mode:
author: Cody Tapscott <topolarity@tapscott.me> 2022-03-01 20:51:01 -0700
committer: Andrew Kelley <andrew@ziglang.org> 2022-03-02 14:45:19 -0500
commit: 5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8 (patch)
tree: 97fc20a6da874c0d808c31e92609e95bb854c973 /lib/std/zig.zig
parent: aa867c7dbe6576f61f957667fef769030aff7c69 (diff)
download: zig-5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8.tar.gz
download: zig-5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8.zip
stage2 parser: UTF-8 encode \u{NNNNNN} escape sequences
The core of this change is to re-use the escape sequence parsing logic for parsing both string and character literals. The actual fix is that UTF-8 encoding was missing for string literals with \u{...} escape sequences.
Diffstat (limited to 'lib/std/zig.zig')
-rw-r--r-- lib/std/zig.zig 203
1 files changed, 4 insertions, 199 deletions
diff --git a/lib/std/zig.zig b/lib/std/zig.zig
index 9b8e2294f2..0d3c94d37b 100644
--- a/lib/std/zig.zig
+++ b/lib/std/zig.zig
@@ -14,6 +14,10 @@ pub const Ast = @import("zig/Ast.zig");
pub const system = @import("zig/system.zig");
pub const CrossTarget = @import("zig/CrossTarget.zig");
+// Character literal parsing
+pub const ParsedCharLiteral = string_literal.ParsedCharLiteral;
+pub const parseCharLiteral = string_literal.parseCharLiteral;
+
// Files needed by translate-c.
pub const c_builtins = @import("zig/c_builtins.zig");
pub const c_translation = @import("zig/c_translation.zig");
@@ -185,205 +189,6 @@ pub fn binNameAlloc(allocator: std.mem.Allocator, options: BinNameOptions) error
}
}
-pub const ParsedCharLiteral = union(enum) {
- success: u32,
- /// The character after backslash is not recognized.
- invalid_escape_character: usize,
- /// Expected hex digit at this index.
- expected_hex_digit: usize,
- /// Unicode escape sequence had no digits with rbrace at this index.
- empty_unicode_escape_sequence: usize,
- /// Expected hex digit or '}' at this index.
- expected_hex_digit_or_rbrace: usize,
- /// The unicode point is outside the range of Unicode codepoints.
- unicode_escape_overflow: usize,
- /// Expected '{' at this index.
- expected_lbrace: usize,
- /// Expected the terminating single quote at this index.
- expected_end: usize,
- /// The character at this index cannot be represented without an escape sequence.
- invalid_character: usize,
-};
-
-/// Only validates escape sequence characters.
-/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between.
-pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
- assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\'');
-
- switch (slice[1]) {
- 0 => return .{ .invalid_character = 1 },
- '\\' => switch (slice[2]) {
- 'n' => return .{ .success = '\n' },
- 'r' => return .{ .success = '\r' },
- '\\' => return .{ .success = '\\' },
- 't' => return .{ .success = '\t' },
- '\'' => return .{ .success = '\'' },
- '"' => return .{ .success = '"' },
- 'x' => {
- if (slice.len < 4) {
- return .{ .expected_hex_digit = 3 };
- }
- var value: u32 = 0;
- var i: usize = 3;
- while (i < 5) : (i += 1) {
- const c = slice[i];
- switch (c) {
- '0'...'9' => {
- value *= 16;
- value += c - '0';
- },
- 'a'...'f' => {
- value *= 16;
- value += c - 'a' + 10;
- },
- 'A'...'F' => {
- value *= 16;
- value += c - 'A' + 10;
- },
- else => {
- return .{ .expected_hex_digit = i };
- },
- }
- }
- if (slice[i] != '\'') {
- return .{ .expected_end = i };
- }
- return .{ .success = value };
- },
- 'u' => {
- var i: usize = 3;
- if (slice[i] != '{') {
- return .{ .expected_lbrace = i };
- }
- i += 1;
- if (slice[i] == '}') {
- return .{ .empty_unicode_escape_sequence = i };
- }
-
- var value: u32 = 0;
- while (i < slice.len) : (i += 1) {
- const c = slice[i];
- switch (c) {
- '0'...'9' => {
- value *= 16;
- value += c - '0';
- },
- 'a'...'f' => {
- value *= 16;
- value += c - 'a' + 10;
- },
- 'A'...'F' => {
- value *= 16;
- value += c - 'A' + 10;
- },
- '}' => {
- i += 1;
- break;
- },
- else => return .{ .expected_hex_digit_or_rbrace = i },
- }
- if (value > 0x10ffff) {
- return .{ .unicode_escape_overflow = i };
- }
- }
- if (slice[i] != '\'') {
- return .{ .expected_end = i };
- }
- return .{ .success = value };
- },
- else => return .{ .invalid_escape_character = 2 },
- },
- else => {
- const codepoint = std.unicode.utf8Decode(slice[1 .. slice.len - 1]) catch unreachable;
- return .{ .success = codepoint };
- },
- }
-}
-
-test "parseCharLiteral" {
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 'a' },
- parseCharLiteral("'a'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 'ä' },
- parseCharLiteral("'ä'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0 },
- parseCharLiteral("'\\x00'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0x4f },
- parseCharLiteral("'\\x4f'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0x4f },
- parseCharLiteral("'\\x4F'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0x3041 },
- parseCharLiteral("'ぁ'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0 },
- parseCharLiteral("'\\u{0}'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0x3041 },
- parseCharLiteral("'\\u{3041}'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0x7f },
- parseCharLiteral("'\\u{7f}'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0x7fff },
- parseCharLiteral("'\\u{7FFF}'"),
- );
-
- try std.testing.expectEqual(
- ParsedCharLiteral{ .expected_hex_digit = 4 },
- parseCharLiteral("'\\x0'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .expected_end = 5 },
- parseCharLiteral("'\\x000'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .invalid_escape_character = 2 },
- parseCharLiteral("'\\y'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .expected_lbrace = 3 },
- parseCharLiteral("'\\u'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .expected_lbrace = 3 },
- parseCharLiteral("'\\uFFFF'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .empty_unicode_escape_sequence = 4 },
- parseCharLiteral("'\\u{}'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .unicode_escape_overflow = 9 },
- parseCharLiteral("'\\u{FFFFFF}'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .expected_hex_digit_or_rbrace = 8 },
- parseCharLiteral("'\\u{FFFF'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .expected_end = 9 },
- parseCharLiteral("'\\u{FFFF}x'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .invalid_character = 1 },
- parseCharLiteral("'\x00'"),
- );
-}
-
test {
@import("std").testing.refAllDecls(@This());
}