about | summary | refs | log | tree | commit | diff
path: root/lib/std
diff options
context:
space:
mode:
author: Cody Tapscott <topolarity@tapscott.me> 2022-03-01 20:51:01 -0700
committer: Andrew Kelley <andrew@ziglang.org> 2022-03-02 14:45:19 -0500
commit: 5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8 (patch)
tree: 97fc20a6da874c0d808c31e92609e95bb854c973 /lib/std
parent: aa867c7dbe6576f61f957667fef769030aff7c69 (diff)
download: zig-5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8.tar.gz
zig-5c8a507e7a8e2e58a0ca855689bcd2edd2ab6ab8.zip
stage2 parser: UTF-8 encode \u{NNNNNN} escape sequences
The core of this change is to re-use the escape sequence parsing logic for parsing both string and character literals. The actual fix is that UTF-8 encoding was missing for string literals with \u{...} escape sequences.
Diffstat (limited to 'lib/std')
-rw-r--r-- lib/std/zig.zig 203
-rw-r--r-- lib/std/zig/string_literal.zig 366
2 files changed, 259 insertions, 310 deletions
diff --git a/lib/std/zig.zig b/lib/std/zig.zig
index 9b8e2294f2..0d3c94d37b 100644
--- a/lib/std/zig.zig
+++ b/lib/std/zig.zig
@@ -14,6 +14,10 @@ pub const Ast = @import("zig/Ast.zig");
pub const system = @import("zig/system.zig");
pub const CrossTarget = @import("zig/CrossTarget.zig");
+// Character literal parsing
+pub const ParsedCharLiteral = string_literal.ParsedCharLiteral;
+pub const parseCharLiteral = string_literal.parseCharLiteral;
+
// Files needed by translate-c.
pub const c_builtins = @import("zig/c_builtins.zig");
pub const c_translation = @import("zig/c_translation.zig");
@@ -185,205 +189,6 @@ pub fn binNameAlloc(allocator: std.mem.Allocator, options: BinNameOptions) error
}
}
-pub const ParsedCharLiteral = union(enum) {
- success: u32,
- /// The character after backslash is not recognized.
- invalid_escape_character: usize,
- /// Expected hex digit at this index.
- expected_hex_digit: usize,
- /// Unicode escape sequence had no digits with rbrace at this index.
- empty_unicode_escape_sequence: usize,
- /// Expected hex digit or '}' at this index.
- expected_hex_digit_or_rbrace: usize,
- /// The unicode point is outside the range of Unicode codepoints.
- unicode_escape_overflow: usize,
- /// Expected '{' at this index.
- expected_lbrace: usize,
- /// Expected the terminating single quote at this index.
- expected_end: usize,
- /// The character at this index cannot be represented without an escape sequence.
- invalid_character: usize,
-};
-
-/// Only validates escape sequence characters.
-/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between.
-pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
- assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\'');
-
- switch (slice[1]) {
- 0 => return .{ .invalid_character = 1 },
- '\\' => switch (slice[2]) {
- 'n' => return .{ .success = '\n' },
- 'r' => return .{ .success = '\r' },
- '\\' => return .{ .success = '\\' },
- 't' => return .{ .success = '\t' },
- '\'' => return .{ .success = '\'' },
- '"' => return .{ .success = '"' },
- 'x' => {
- if (slice.len < 4) {
- return .{ .expected_hex_digit = 3 };
- }
- var value: u32 = 0;
- var i: usize = 3;
- while (i < 5) : (i += 1) {
- const c = slice[i];
- switch (c) {
- '0'...'9' => {
- value *= 16;
- value += c - '0';
- },
- 'a'...'f' => {
- value *= 16;
- value += c - 'a' + 10;
- },
- 'A'...'F' => {
- value *= 16;
- value += c - 'A' + 10;
- },
- else => {
- return .{ .expected_hex_digit = i };
- },
- }
- }
- if (slice[i] != '\'') {
- return .{ .expected_end = i };
- }
- return .{ .success = value };
- },
- 'u' => {
- var i: usize = 3;
- if (slice[i] != '{') {
- return .{ .expected_lbrace = i };
- }
- i += 1;
- if (slice[i] == '}') {
- return .{ .empty_unicode_escape_sequence = i };
- }
-
- var value: u32 = 0;
- while (i < slice.len) : (i += 1) {
- const c = slice[i];
- switch (c) {
- '0'...'9' => {
- value *= 16;
- value += c - '0';
- },
- 'a'...'f' => {
- value *= 16;
- value += c - 'a' + 10;
- },
- 'A'...'F' => {
- value *= 16;
- value += c - 'A' + 10;
- },
- '}' => {
- i += 1;
- break;
- },
- else => return .{ .expected_hex_digit_or_rbrace = i },
- }
- if (value > 0x10ffff) {
- return .{ .unicode_escape_overflow = i };
- }
- }
- if (slice[i] != '\'') {
- return .{ .expected_end = i };
- }
- return .{ .success = value };
- },
- else => return .{ .invalid_escape_character = 2 },
- },
- else => {
- const codepoint = std.unicode.utf8Decode(slice[1 .. slice.len - 1]) catch unreachable;
- return .{ .success = codepoint };
- },
- }
-}
-
-test "parseCharLiteral" {
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 'a' },
- parseCharLiteral("'a'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 'ä' },
- parseCharLiteral("'ä'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0 },
- parseCharLiteral("'\\x00'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0x4f },
- parseCharLiteral("'\\x4f'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0x4f },
- parseCharLiteral("'\\x4F'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0x3041 },
- parseCharLiteral("'ぁ'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0 },
- parseCharLiteral("'\\u{0}'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0x3041 },
- parseCharLiteral("'\\u{3041}'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0x7f },
- parseCharLiteral("'\\u{7f}'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .success = 0x7fff },
- parseCharLiteral("'\\u{7FFF}'"),
- );
-
- try std.testing.expectEqual(
- ParsedCharLiteral{ .expected_hex_digit = 4 },
- parseCharLiteral("'\\x0'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .expected_end = 5 },
- parseCharLiteral("'\\x000'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .invalid_escape_character = 2 },
- parseCharLiteral("'\\y'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .expected_lbrace = 3 },
- parseCharLiteral("'\\u'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .expected_lbrace = 3 },
- parseCharLiteral("'\\uFFFF'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .empty_unicode_escape_sequence = 4 },
- parseCharLiteral("'\\u{}'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .unicode_escape_overflow = 9 },
- parseCharLiteral("'\\u{FFFFFF}'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .expected_hex_digit_or_rbrace = 8 },
- parseCharLiteral("'\\u{FFFF'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .expected_end = 9 },
- parseCharLiteral("'\\u{FFFF}x'"),
- );
- try std.testing.expectEqual(
- ParsedCharLiteral{ .invalid_character = 1 },
- parseCharLiteral("'\x00'"),
- );
-}
-
test {
@import("std").testing.refAllDecls(@This());
}
diff --git a/lib/std/zig/string_literal.zig b/lib/std/zig/string_literal.zig
index 1eaab26e3a..07ce08f491 100644
--- a/lib/std/zig/string_literal.zig
+++ b/lib/std/zig/string_literal.zig
@@ -1,129 +1,268 @@
const std = @import("../std.zig");
const assert = std.debug.assert;
+const utf8Decode = std.unicode.utf8Decode;
+const utf8Encode = std.unicode.utf8Encode;
pub const ParseError = error{
OutOfMemory,
- InvalidStringLiteral,
+ InvalidLiteral,
+};
+
+pub const ParsedCharLiteral = union(enum) {
+ success: u21,
+ failure: Error,
};
pub const Result = union(enum) {
success,
- /// Found an invalid character at this index.
+ failure: Error,
+};
+
+pub const Error = union(enum) {
+ /// The character after backslash is missing or not recognized.
+ invalid_escape_character: usize,
+ /// Expected hex digit at this index.
+ expected_hex_digit: usize,
+ /// Unicode escape sequence had no digits with rbrace at this index.
+ empty_unicode_escape_sequence: usize,
+ /// Expected hex digit or '}' at this index.
+ expected_hex_digit_or_rbrace: usize,
+ /// Invalid unicode codepoint at this index.
+ invalid_unicode_codepoint: usize,
+ /// Expected '{' at this index.
+ expected_lbrace: usize,
+ /// Expected '}' at this index.
+ expected_rbrace: usize,
+ /// Expected '\'' at this index.
+ expected_single_quote: usize,
+ /// The character at this index cannot be represented without an escape sequence.
invalid_character: usize,
- /// Expected hex digits at this index.
- expected_hex_digits: usize,
- /// Invalid hex digits at this index.
- invalid_hex_escape: usize,
- /// Invalid unicode escape at this index.
- invalid_unicode_escape: usize,
- /// The left brace at this index is missing a matching right brace.
- missing_matching_rbrace: usize,
- /// Expected unicode digits at this index.
- expected_unicode_digits: usize,
};
+/// Only validates escape sequence characters.
+/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between.
+pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
+ assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\'');
+
+ switch (slice[1]) {
+ '\\' => {
+ var offset: usize = 1;
+ const result = parseEscapeSequence(slice, &offset);
+ if (result == .success and (offset + 1 != slice.len or slice[offset] != '\''))
+ return .{ .failure = .{ .expected_single_quote = offset } };
+
+ return result;
+ },
+ 0 => return .{ .failure = .{ .invalid_character = 1 } },
+ else => {
+ const codepoint = utf8Decode(slice[1 .. slice.len - 1]) catch unreachable;
+ return .{ .success = codepoint };
+ },
+ }
+}
+
+/// Parse an escape sequence from `slice[offset..]`. If parsing is successful,
+/// offset is updated to reflect the characters consumed.
+fn parseEscapeSequence(slice: []const u8, offset: *usize) ParsedCharLiteral {
+ assert(slice.len > offset.*);
+ assert(slice[offset.*] == '\\');
+
+ if (slice.len == offset.* + 1)
+ return .{ .failure = .{ .invalid_escape_character = offset.* + 1 } };
+
+ offset.* += 2;
+ switch (slice[offset.* - 1]) {
+ 'n' => return .{ .success = '\n' },
+ 'r' => return .{ .success = '\r' },
+ '\\' => return .{ .success = '\\' },
+ 't' => return .{ .success = '\t' },
+ '\'' => return .{ .success = '\'' },
+ '"' => return .{ .success = '"' },
+ 'x' => {
+ var value: u8 = 0;
+ var i: usize = offset.*;
+ while (i < offset.* + 2) : (i += 1) {
+ if (i == slice.len) return .{ .failure = .{ .expected_hex_digit = i } };
+
+ const c = slice[i];
+ switch (c) {
+ '0'...'9' => {
+ value *= 16;
+ value += c - '0';
+ },
+ 'a'...'f' => {
+ value *= 16;
+ value += c - 'a' + 10;
+ },
+ 'A'...'F' => {
+ value *= 16;
+ value += c - 'A' + 10;
+ },
+ else => {
+ return .{ .failure = .{ .expected_hex_digit = i } };
+ },
+ }
+ }
+ offset.* = i;
+ return .{ .success = value };
+ },
+ 'u' => {
+ var i: usize = offset.*;
+ if (i >= slice.len or slice[i] != '{') return .{ .failure = .{ .expected_lbrace = i } };
+ i += 1;
+ if (i >= slice.len) return .{ .failure = .{ .expected_hex_digit_or_rbrace = i } };
+ if (slice[i] == '}') return .{ .failure = .{ .empty_unicode_escape_sequence = i } };
+
+ var value: u32 = 0;
+ while (i < slice.len) : (i += 1) {
+ const c = slice[i];
+ switch (c) {
+ '0'...'9' => {
+ value *= 16;
+ value += c - '0';
+ },
+ 'a'...'f' => {
+ value *= 16;
+ value += c - 'a' + 10;
+ },
+ 'A'...'F' => {
+ value *= 16;
+ value += c - 'A' + 10;
+ },
+ '}' => {
+ i += 1;
+ break;
+ },
+ else => return .{ .failure = .{ .expected_hex_digit_or_rbrace = i } },
+ }
+ if (value > 0x10ffff) {
+ return .{ .failure = .{ .invalid_unicode_codepoint = i } };
+ }
+ } else {
+ return .{ .failure = .{ .expected_rbrace = i } };
+ }
+ offset.* = i;
+ return .{ .success = @intCast(u21, value) };
+ },
+ else => return .{ .failure = .{ .invalid_escape_character = offset.* - 1 } },
+ }
+}
+
+test "parseCharLiteral" {
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .success = 'a' },
+ parseCharLiteral("'a'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .success = 'ä' },
+ parseCharLiteral("'ä'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .success = 0 },
+ parseCharLiteral("'\\x00'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .success = 0x4f },
+ parseCharLiteral("'\\x4f'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .success = 0x4f },
+ parseCharLiteral("'\\x4F'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .success = 0x3041 },
+ parseCharLiteral("'ぁ'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .success = 0 },
+ parseCharLiteral("'\\u{0}'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .success = 0x3041 },
+ parseCharLiteral("'\\u{3041}'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .success = 0x7f },
+ parseCharLiteral("'\\u{7f}'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .success = 0x7fff },
+ parseCharLiteral("'\\u{7FFF}'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .failure = .{ .expected_hex_digit = 4 } },
+ parseCharLiteral("'\\x0'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .failure = .{ .expected_single_quote = 5 } },
+ parseCharLiteral("'\\x000'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .failure = .{ .invalid_escape_character = 2 } },
+ parseCharLiteral("'\\y'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .failure = .{ .expected_lbrace = 3 } },
+ parseCharLiteral("'\\u'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .failure = .{ .expected_lbrace = 3 } },
+ parseCharLiteral("'\\uFFFF'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .failure = .{ .empty_unicode_escape_sequence = 4 } },
+ parseCharLiteral("'\\u{}'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .failure = .{ .invalid_unicode_codepoint = 9 } },
+ parseCharLiteral("'\\u{FFFFFF}'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .failure = .{ .expected_hex_digit_or_rbrace = 8 } },
+ parseCharLiteral("'\\u{FFFF'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .failure = .{ .expected_single_quote = 9 } },
+ parseCharLiteral("'\\u{FFFF}x'"),
+ );
+ try std.testing.expectEqual(
+ ParsedCharLiteral{ .failure = .{ .invalid_character = 1 } },
+ parseCharLiteral("'\x00'"),
+ );
+}
+
/// Parses `bytes` as a Zig string literal and appends the result to `buf`.
/// Asserts `bytes` has '"' at beginning and end.
pub fn parseAppend(buf: *std.ArrayList(u8), bytes: []const u8) error{OutOfMemory}!Result {
assert(bytes.len >= 2 and bytes[0] == '"' and bytes[bytes.len - 1] == '"');
- const slice = bytes[1..];
-
- const prev_len = buf.items.len;
- try buf.ensureUnusedCapacity(slice.len - 1);
- errdefer buf.shrinkRetainingCapacity(prev_len);
-
- const State = enum {
- Start,
- Backslash,
- };
-
- var state = State.Start;
- var index: usize = 0;
- while (true) : (index += 1) {
- const b = slice[index];
-
- switch (state) {
- State.Start => switch (b) {
- '\\' => state = State.Backslash,
- '\n' => {
- return Result{ .invalid_character = index };
- },
- '"' => return Result.success,
- else => try buf.append(b),
- },
- State.Backslash => switch (b) {
- 'n' => {
- try buf.append('\n');
- state = State.Start;
- },
- 'r' => {
- try buf.append('\r');
- state = State.Start;
- },
- '\\' => {
- try buf.append('\\');
- state = State.Start;
- },
- 't' => {
- try buf.append('\t');
- state = State.Start;
- },
- '\'' => {
- try buf.append('\'');
- state = State.Start;
- },
- '"' => {
- try buf.append('"');
- state = State.Start;
- },
- 'x' => {
- // TODO: add more/better/broader tests for this.
- const index_continue = index + 3;
- if (slice.len < index_continue) {
- return Result{ .expected_hex_digits = index };
- }
- if (std.fmt.parseUnsigned(u8, slice[index + 1 .. index_continue], 16)) |byte| {
- try buf.append(byte);
- state = State.Start;
- index = index_continue - 1; // loop-header increments again
- } else |err| switch (err) {
- error.Overflow => unreachable, // 2 digits base 16 fits in a u8.
- error.InvalidCharacter => {
- return Result{ .invalid_hex_escape = index + 1 };
- },
- }
- },
- 'u' => {
- // TODO: add more/better/broader tests for this.
- // TODO: we are already inside a nice, clean state machine... use it
- // instead of this hacky code.
- if (slice.len > index + 2 and slice[index + 1] == '{') {
- if (std.mem.indexOfScalarPos(u8, slice[0..std.math.min(index + 9, slice.len)], index + 3, '}')) |index_end| {
- const hex_str = slice[index + 2 .. index_end];
- if (std.fmt.parseUnsigned(u32, hex_str, 16)) |uint| {
- if (uint <= 0x10ffff) {
- // TODO this incorrectly depends on endianness
- try buf.appendSlice(std.mem.toBytes(uint)[0..]);
- state = State.Start;
- index = index_end; // loop-header increments
- continue;
- }
- } else |err| switch (err) {
- error.Overflow => unreachable,
- error.InvalidCharacter => {
- return Result{ .invalid_unicode_escape = index + 1 };
- },
- }
+ try buf.ensureUnusedCapacity(bytes.len - 2);
+
+ var index: usize = 1;
+ while (true) {
+ const b = bytes[index];
+
+ switch (b) {
+ '\\' => {
+ const escape_char_index = index + 1;
+ const result = parseEscapeSequence(bytes, &index);
+ switch (result) {
+ .success => |codepoint| {
+ if (bytes[escape_char_index] == 'u') {
+ buf.items.len += utf8Encode(codepoint, buf.unusedCapacitySlice()) catch {
+ return Result{ .failure = .{ .invalid_unicode_codepoint = escape_char_index + 1 } };
+ };
} else {
- return Result{ .missing_matching_rbrace = index + 1 };
+ buf.appendAssumeCapacity(@intCast(u8, codepoint));
}
- } else {
- return Result{ .expected_unicode_digits = index };
- }
- },
- else => {
- return Result{ .invalid_character = index };
- },
+ },
+ .failure => |err| return Result{ .failure = err },
+ }
+ },
+ '\n' => return Result{ .failure = .{ .invalid_character = index } },
+ '"' => return Result.success,
+ else => {
+ try buf.append(b);
+ index += 1;
},
}
} else unreachable; // TODO should not need else unreachable on while(true)
@@ -137,18 +276,23 @@ pub fn parseAlloc(allocator: std.mem.Allocator, bytes: []const u8) ParseError![]
switch (try parseAppend(&buf, bytes)) {
.success => return buf.toOwnedSlice(),
- else => return error.InvalidStringLiteral,
+ .failure => return error.InvalidLiteral,
}
}
test "parse" {
const expect = std.testing.expect;
+ const expectError = std.testing.expectError;
const eql = std.mem.eql;
- var fixed_buf_mem: [32]u8 = undefined;
- var fixed_buf_alloc = std.heap.FixedBufferAllocator.init(fixed_buf_mem[0..]);
+ var fixed_buf_mem: [64]u8 = undefined;
+ var fixed_buf_alloc = std.heap.FixedBufferAllocator.init(&fixed_buf_mem);
var alloc = fixed_buf_alloc.allocator();
+ try expectError(error.InvalidLiteral, parseAlloc(alloc, "\"\\x6\""));
+ try expect(eql(u8, "foo\nbar", try parseAlloc(alloc, "\"foo\\nbar\"")));
+ try expect(eql(u8, "\x12foo", try parseAlloc(alloc, "\"\\x12foo\"")));
+ try expect(eql(u8, "bytes\u{1234}foo", try parseAlloc(alloc, "\"bytes\\u{1234}foo\"")));
try expect(eql(u8, "foo", try parseAlloc(alloc, "\"foo\"")));
try expect(eql(u8, "foo", try parseAlloc(alloc, "\"f\x6f\x6f\"")));
try expect(eql(u8, "f💯", try parseAlloc(alloc, "\"f\u{1f4af}\"")));