diff options
| -rw-r--r-- | lib/std/json.zig | 835 | ||||
| -rw-r--r-- | lib/std/meta.zig | 4 |
2 files changed, 468 insertions, 371 deletions
diff --git a/lib/std/json.zig b/lib/std/json.zig index e278c49f00..a7e98ad1a5 100644 --- a/lib/std/json.zig +++ b/lib/std/json.zig @@ -4,83 +4,63 @@ const std = @import("std.zig"); const debug = std.debug; +const assert = debug.assert; const testing = std.testing; const mem = std.mem; const maxInt = std.math.maxInt; pub const WriteStream = @import("json/write_stream.zig").WriteStream; +const StringEscapes = union(enum) { + None, + + Some: struct { + size_diff: isize, + }, +}; + /// A single token slice into the parent string. /// /// Use `token.slice()` on the input at the current position to get the current slice. -pub const Token = struct { - id: Id, - /// How many bytes do we skip before counting - offset: u1, - /// Whether string contains an escape sequence and cannot be zero-copied - string_has_escape: bool, - /// Whether number is simple and can be represented by an integer (i.e. no `.` or `e`) - number_is_integer: bool, - /// How many bytes from the current position behind the start of this token is. - count: usize, - - pub const Id = enum { - ObjectBegin, - ObjectEnd, - ArrayBegin, - ArrayEnd, - String, - Number, - True, - False, - Null, - }; - - pub fn init(id: Id, count: usize, offset: u1) Token { - return Token{ - .id = id, - .offset = offset, - .string_has_escape = false, - .number_is_integer = true, - .count = count, - }; - } - - pub fn initString(count: usize, has_unicode_escape: bool) Token { - return Token{ - .id = Id.String, - .offset = 0, - .string_has_escape = has_unicode_escape, - .number_is_integer = true, - .count = count, - }; - } +pub const Token = union(enum) { + ObjectBegin, + ObjectEnd, + ArrayBegin, + ArrayEnd, + String: struct { + /// How many bytes the token is. + count: usize, + + /// Whether string contains an escape sequence and cannot be zero-copied + escapes: StringEscapes, + + pub fn decodedLength(self: @This()) usize { + return self.count +% switch (self.escapes) { + .None => 0, + .Some => |s| @bitCast(usize, s.size_diff), + }; + } - pub fn initNumber(count: usize, number_is_integer: bool) Token { - return Token{ - .id = Id.Number, - .offset = 0, - .string_has_escape = false, - .number_is_integer = number_is_integer, - .count = count, - }; - } + /// Slice into the underlying input string. + pub fn slice(self: @This(), input: []const u8, i: usize) []const u8 { + return input[i - self.count .. i]; + } + }, + Number: struct { + /// How many bytes the token is. + count: usize, - /// A marker token is a zero-length - pub fn initMarker(id: Id) Token { - return Token{ - .id = id, - .offset = 0, - .string_has_escape = false, - .number_is_integer = true, - .count = 0, - }; - } + /// Whether number is simple and can be represented by an integer (i.e. no `.` or `e`) + is_integer: bool, - /// Slice into the underlying input string. - pub fn slice(self: Token, input: []const u8, i: usize) []const u8 { - return input[i + self.offset - self.count .. i + self.offset]; - } + /// Slice into the underlying input string. + pub fn slice(self: @This(), input: []const u8, i: usize) []const u8 { + return input[i - self.count .. i]; + } + }, + True, + False, + Null, }; /// A small streaming JSON parser. This accepts input one byte at a time and returns tokens as @@ -102,7 +82,12 @@ pub const StreamingParser = struct { // If we stopped now, would the complete parsed string to now be a valid json string complete: bool, // Current token flags to pass through to the next generated, see Token. - string_has_escape: bool, + string_escapes: StringEscapes, + // When in .String states, was the previous character a high surrogate? + string_last_was_high_surrogate: bool, + // Used inside of StringEscapeHexUnicode* states + string_unicode_codepoint: u21, + // When in .Number states, is the number a (still) valid integer? number_is_integer: bool, // Bit-stack for nested object/map literals (max 255 nestings). @@ -120,16 +105,18 @@ pub const StreamingParser = struct { } pub fn reset(p: *StreamingParser) void { - p.state = State.TopLevelBegin; + p.state = .TopLevelBegin; p.count = 0; // Set before ever read in main transition function p.after_string_state = undefined; - p.after_value_state = State.ValueEnd; // handle end of values normally + p.after_value_state = .ValueEnd; // handle end of values normally p.stack = 0; p.stack_used = 0; p.complete = false; - p.string_has_escape = false; - p.number_is_integer = true; + p.string_escapes = undefined; + p.string_last_was_high_surrogate = undefined; + p.string_unicode_codepoint = undefined; + p.number_is_integer = undefined; } pub const State = enum { @@ -223,66 +210,67 @@ pub const StreamingParser = struct { // Perform a single transition on the state machine and return any possible token. fn transition(p: *StreamingParser, c: u8, token: *?Token) Error!bool { switch (p.state) { - State.TopLevelBegin => switch (c) { + .TopLevelBegin => switch (c) { '{' => { p.stack <<= 1; p.stack |= object_bit; p.stack_used += 1; - p.state = State.ValueBegin; - p.after_string_state = State.ObjectSeparator; + p.state = .ValueBegin; + p.after_string_state = .ObjectSeparator; - token.* = Token.initMarker(Token.Id.ObjectBegin); + token.* = Token.ObjectBegin; }, '[' => { p.stack <<= 1; p.stack |= array_bit; p.stack_used += 1; - p.state = State.ValueBegin; - p.after_string_state = State.ValueEnd; + p.state = .ValueBegin; + p.after_string_state = .ValueEnd; - token.* = Token.initMarker(Token.Id.ArrayBegin); + token.* = Token.ArrayBegin; }, '-' => { p.number_is_integer = true; - p.state = State.Number; - p.after_value_state = State.TopLevelEnd; + p.state = .Number; + p.after_value_state = .TopLevelEnd; p.count = 0; }, '0' => { p.number_is_integer = true; - p.state = State.NumberMaybeDotOrExponent; - p.after_value_state = State.TopLevelEnd; + p.state = .NumberMaybeDotOrExponent; + p.after_value_state = .TopLevelEnd; p.count = 0; }, '1'...'9' => { p.number_is_integer = true; - p.state = State.NumberMaybeDigitOrDotOrExponent; - p.after_value_state = State.TopLevelEnd; + p.state = .NumberMaybeDigitOrDotOrExponent; + p.after_value_state = .TopLevelEnd; p.count = 0; }, '"' => { - p.state = State.String; - p.after_value_state = State.TopLevelEnd; + p.state = .String; + p.after_value_state = .TopLevelEnd; // We don't actually need the following since after_value_state should override. - p.after_string_state = State.ValueEnd; - p.string_has_escape = false; + p.after_string_state = .ValueEnd; + p.string_escapes = .None; + p.string_last_was_high_surrogate = false; p.count = 0; }, 't' => { - p.state = State.TrueLiteral1; - p.after_value_state = State.TopLevelEnd; + p.state = .TrueLiteral1; + p.after_value_state = .TopLevelEnd; p.count = 0; }, 'f' => { - p.state = State.FalseLiteral1; - p.after_value_state = State.TopLevelEnd; + p.state = .FalseLiteral1; + p.after_value_state = .TopLevelEnd; p.count = 0; }, 'n' => { - p.state = State.NullLiteral1; - p.after_value_state = State.TopLevelEnd; + p.state = .NullLiteral1; + p.after_value_state = .TopLevelEnd; p.count = 0; }, 0x09, 0x0A, 0x0D, 0x20 => { @@ -293,7 +281,7 @@ pub const StreamingParser = struct { }, }, - State.TopLevelEnd => switch (c) { + .TopLevelEnd => switch (c) { 0x09, 0x0A, 0x0D, 0x20 => { // whitespace }, @@ -302,7 +290,7 @@ pub const StreamingParser = struct { }, }, - State.ValueBegin => switch (c) { + .ValueBegin => switch (c) { // NOTE: These are shared in ValueEnd as well, think we can reorder states to // be a bit clearer and avoid this duplication. '}' => { @@ -314,7 +302,7 @@ pub const StreamingParser = struct { return error.TooManyClosingItems; } - p.state = State.ValueBegin; + p.state = .ValueBegin; p.after_string_state = State.fromInt(p.stack & 1); p.stack >>= 1; @@ -323,14 +311,14 @@ pub const StreamingParser = struct { switch (p.stack_used) { 0 => { p.complete = true; - p.state = State.TopLevelEnd; + p.state = .TopLevelEnd; }, else => { - p.state = State.ValueEnd; + p.state = .ValueEnd; }, } - token.* = Token.initMarker(Token.Id.ObjectEnd); + token.* = Token.ObjectEnd; }, ']' => { if (p.stack & 1 != array_bit) { @@ -340,7 +328,7 @@ pub const StreamingParser = struct { return error.TooManyClosingItems; } - p.state = State.ValueBegin; + p.state = .ValueBegin; p.after_string_state = State.fromInt(p.stack & 1); p.stack >>= 1; @@ -349,14 +337,14 @@ pub const StreamingParser = struct { switch (p.stack_used) { 0 => { p.complete = true; - p.state = State.TopLevelEnd; + p.state = .TopLevelEnd; }, else => { - p.state = State.ValueEnd; + p.state = .ValueEnd; }, } - token.* = Token.initMarker(Token.Id.ArrayEnd); + token.* = Token.ArrayEnd; }, '{' => { if (p.stack_used == max_stack_size) { @@ -367,10 +355,10 @@ pub const StreamingParser = struct { p.stack |= object_bit; p.stack_used += 1; - p.state = State.ValueBegin; - p.after_string_state = State.ObjectSeparator; + p.state = .ValueBegin; + p.after_string_state = .ObjectSeparator; - token.* = Token.initMarker(Token.Id.ObjectBegin); + token.* = Token.ObjectBegin; }, '[' => { if (p.stack_used == max_stack_size) { @@ -381,40 +369,42 @@ pub const StreamingParser = struct { p.stack |= array_bit; p.stack_used += 1; - p.state = State.ValueBegin; - p.after_string_state = State.ValueEnd; + p.state = .ValueBegin; + p.after_string_state = .ValueEnd; - token.* = Token.initMarker(Token.Id.ArrayBegin); + token.* = Token.ArrayBegin; }, '-' => { p.number_is_integer = true; - p.state = State.Number; + p.state = .Number; p.count = 0; }, '0' => { p.number_is_integer = true; - p.state = State.NumberMaybeDotOrExponent; + p.state = .NumberMaybeDotOrExponent; p.count = 0; }, '1'...'9' => { p.number_is_integer = true; - p.state = State.NumberMaybeDigitOrDotOrExponent; + p.state = .NumberMaybeDigitOrDotOrExponent; p.count = 0; }, '"' => { - p.state = State.String; + p.state = .String; + p.string_escapes = .None; + p.string_last_was_high_surrogate = false; p.count = 0; }, 't' => { - p.state = State.TrueLiteral1; + p.state = .TrueLiteral1; p.count = 0; }, 'f' => { - p.state = State.FalseLiteral1; + p.state = .FalseLiteral1; p.count = 0; }, 'n' => { - p.state = State.NullLiteral1; + p.state = .NullLiteral1; p.count = 0; }, 0x09, 0x0A, 0x0D, 0x20 => { @@ -426,7 +416,7 @@ pub const StreamingParser = struct { }, // TODO: A bit of duplication here and in the following state, redo. - State.ValueBeginNoClosing => switch (c) { + .ValueBeginNoClosing => switch (c) { '{' => { if (p.stack_used == max_stack_size) { return error.TooManyNestedItems; @@ -436,10 +426,10 @@ pub const StreamingParser = struct { p.stack |= object_bit; p.stack_used += 1; - p.state = State.ValueBegin; - p.after_string_state = State.ObjectSeparator; + p.state = .ValueBegin; + p.after_string_state = .ObjectSeparator; - token.* = Token.initMarker(Token.Id.ObjectBegin); + token.* = Token.ObjectBegin; }, '[' => { if (p.stack_used == max_stack_size) { @@ -450,40 +440,42 @@ pub const StreamingParser = struct { p.stack |= array_bit; p.stack_used += 1; - p.state = State.ValueBegin; - p.after_string_state = State.ValueEnd; + p.state = .ValueBegin; + p.after_string_state = .ValueEnd; - token.* = Token.initMarker(Token.Id.ArrayBegin); + token.* = Token.ArrayBegin; }, '-' => { p.number_is_integer = true; - p.state = State.Number; + p.state = .Number; p.count = 0; }, '0' => { p.number_is_integer = true; - p.state = State.NumberMaybeDotOrExponent; + p.state = .NumberMaybeDotOrExponent; p.count = 0; }, '1'...'9' => { p.number_is_integer = true; - p.state = State.NumberMaybeDigitOrDotOrExponent; + p.state = .NumberMaybeDigitOrDotOrExponent; p.count = 0; }, '"' => { - p.state = State.String; + p.state = .String; + p.string_escapes = .None; + p.string_last_was_high_surrogate = false; p.count = 0; }, 't' => { - p.state = State.TrueLiteral1; + p.state = .TrueLiteral1; p.count = 0; }, 'f' => { - p.state = State.FalseLiteral1; + p.state = .FalseLiteral1; p.count = 0; }, 'n' => { - p.state = State.NullLiteral1; + p.state = .NullLiteral1; p.count = 0; }, 0x09, 0x0A, 0x0D, 0x20 => { @@ -494,17 +486,17 @@ pub const StreamingParser = struct { }, }, - State.ValueEnd => switch (c) { + .ValueEnd => switch (c) { ',' => { p.after_string_state = State.fromInt(p.stack & 1); - p.state = State.ValueBeginNoClosing; + p.state = .ValueBeginNoClosing; }, ']' => { if (p.stack_used == 0) { return error.UnbalancedBrackets; } - p.state = State.ValueEnd; + p.state = .ValueEnd; p.after_string_state = State.fromInt(p.stack & 1); p.stack >>= 1; @@ -512,17 +504,17 @@ pub const StreamingParser = struct { if (p.stack_used == 0) { p.complete = true; - p.state = State.TopLevelEnd; + p.state = .TopLevelEnd; } - token.* = Token.initMarker(Token.Id.ArrayEnd); + token.* = Token.ArrayEnd; }, '}' => { if (p.stack_used == 0) { return error.UnbalancedBraces; } - p.state = State.ValueEnd; + p.state = .ValueEnd; p.after_string_state = State.fromInt(p.stack & 1); p.stack >>= 1; @@ -530,10 +522,10 @@ pub const StreamingParser = struct { if (p.stack_used == 0) { p.complete = true; - p.state = State.TopLevelEnd; + p.state = .TopLevelEnd; } - token.* = Token.initMarker(Token.Id.ObjectEnd); + token.* = Token.ObjectEnd; }, 0x09, 0x0A, 0x0D, 0x20 => { // whitespace @@ -543,10 +535,10 @@ pub const StreamingParser = struct { }, }, - State.ObjectSeparator => switch (c) { + .ObjectSeparator => switch (c) { ':' => { - p.state = State.ValueBegin; - p.after_string_state = State.ValueEnd; + p.state = .ValueBegin; + p.after_string_state = .ValueEnd; }, 0x09, 0x0A, 0x0D, 0x20 => { // whitespace @@ -556,55 +548,72 @@ pub const StreamingParser = struct { }, }, - State.String => switch (c) { + .String => switch (c) { 0x00...0x1F => { return error.InvalidControlCharacter; }, '"' => { p.state = p.after_string_state; - if (p.after_value_state == State.TopLevelEnd) { - p.state = State.TopLevelEnd; + if (p.after_value_state == .TopLevelEnd) { + p.state = .TopLevelEnd; p.complete = true; } - token.* = Token.initString(p.count - 1, p.string_has_escape); + token.* = .{ + .String = .{ + .count = p.count - 1, + .escapes = p.string_escapes, + }, + }; + p.string_escapes = undefined; + p.string_last_was_high_surrogate = undefined; }, '\\' => { - p.state = State.StringEscapeCharacter; + p.state = .StringEscapeCharacter; + switch (p.string_escapes) { + .None => { + p.string_escapes = .{ .Some = .{ .size_diff = 0 } }; + }, + .Some => {}, + } }, 0x20, 0x21, 0x23...0x5B, 0x5D...0x7F => { // non-control ascii + p.string_last_was_high_surrogate = false; }, 0xC0...0xDF => { - p.state = State.StringUtf8Byte1; + p.state = .StringUtf8Byte1; }, 0xE0...0xEF => { - p.state = State.StringUtf8Byte2; + p.state = .StringUtf8Byte2; }, 0xF0...0xFF => { - p.state = State.StringUtf8Byte3; + p.state = .StringUtf8Byte3; }, else => { return error.InvalidUtf8Byte; }, }, - State.StringUtf8Byte3 => switch (c >> 6) { - 0b10 => p.state = State.StringUtf8Byte2, + .StringUtf8Byte3 => switch (c >> 6) { + 0b10 => p.state = .StringUtf8Byte2, else => return error.InvalidUtf8Byte, }, - State.StringUtf8Byte2 => switch (c >> 6) { - 0b10 => p.state = State.StringUtf8Byte1, + .StringUtf8Byte2 => switch (c >> 6) { + 0b10 => p.state = .StringUtf8Byte1, else => return error.InvalidUtf8Byte, }, - State.StringUtf8Byte1 => switch (c >> 6) { - 0b10 => p.state = State.String, + .StringUtf8Byte1 => switch (c >> 6) { + 0b10 => { + p.state = .String; + p.string_last_was_high_surrogate = false; + }, else => return error.InvalidUtf8Byte, }, - State.StringEscapeCharacter => switch (c) { + .StringEscapeCharacter => switch (c) { // NOTE: '/' is allowed as an escaped character but it also is allowed // as unescaped according to the RFC. There is a reported errata which suggests // removing the non-escaped variant but it makes more sense to simply disallow @@ -614,54 +623,121 @@ pub const StreamingParser = struct { // however, so we default to the status quo where both are accepted until this // is further clarified. '"', '\\', '/', 'b', 'f', 'n', 'r', 't' => { - p.string_has_escape = true; - p.state = State.String; + p.string_escapes.Some.size_diff -= 1; + p.state = .String; + p.string_last_was_high_surrogate = false; }, 'u' => { - p.string_has_escape = true; - p.state = State.StringEscapeHexUnicode4; + p.state = .StringEscapeHexUnicode4; }, else => { return error.InvalidEscapeCharacter; }, }, - State.StringEscapeHexUnicode4 => switch (c) { - '0'...'9', 'A'...'F', 'a'...'f' => { - p.state = State.StringEscapeHexUnicode3; - }, - else => return error.InvalidUnicodeHexSymbol, + .StringEscapeHexUnicode4 => { + var codepoint: u21 = undefined; + switch (c) { + else => return error.InvalidUnicodeHexSymbol, + '0'...'9' => { + codepoint = c - '0'; + }, + 'A'...'F' => { + codepoint = c - 'A' + 10; + }, + 'a'...'f' => { + codepoint = c - 'a' + 10; + }, + } + p.state = .StringEscapeHexUnicode3; + p.string_unicode_codepoint = codepoint << 12; }, - State.StringEscapeHexUnicode3 => switch (c) { - '0'...'9', 'A'...'F', 'a'...'f' => { - p.state = State.StringEscapeHexUnicode2; - }, - else => return error.InvalidUnicodeHexSymbol, + .StringEscapeHexUnicode3 => { + var codepoint: u21 = undefined; + switch (c) { + else => return error.InvalidUnicodeHexSymbol, + '0'...'9' => { + codepoint = c - '0'; + }, + 'A'...'F' => { + codepoint = c - 'A' + 10; + }, + 'a'...'f' => { + codepoint = c - 'a' + 10; + }, + } + p.state = .StringEscapeHexUnicode2; + p.string_unicode_codepoint |= codepoint << 8; }, - State.StringEscapeHexUnicode2 => switch (c) { - '0'...'9', 'A'...'F', 'a'...'f' => { - p.state = State.StringEscapeHexUnicode1; - }, - else => return error.InvalidUnicodeHexSymbol, + .StringEscapeHexUnicode2 => { + var codepoint: u21 = undefined; + switch (c) { + else => return error.InvalidUnicodeHexSymbol, + '0'...'9' => { + codepoint = c - '0'; + }, + 'A'...'F' => { + codepoint = c - 'A' + 10; + }, + 'a'...'f' => { + codepoint = c - 'a' + 10; + }, + } + p.state = .StringEscapeHexUnicode1; + p.string_unicode_codepoint |= codepoint << 4; }, - State.StringEscapeHexUnicode1 => switch (c) { - '0'...'9', 'A'...'F', 'a'...'f' => { - p.state = State.String; - }, - else => return error.InvalidUnicodeHexSymbol, + .StringEscapeHexUnicode1 => { + var codepoint: u21 = undefined; + switch (c) { + else => return error.InvalidUnicodeHexSymbol, + '0'...'9' => { + codepoint = c - '0'; + }, + 'A'...'F' => { + codepoint = c - 'A' + 10; + }, + 'a'...'f' => { + codepoint = c - 'a' + 10; + }, + } + p.state = .String; + p.string_unicode_codepoint |= codepoint; + if (p.string_unicode_codepoint < 0xD800 or p.string_unicode_codepoint >= 0xE000) { + // not part of surrogate pair + p.string_escapes.Some.size_diff -= @as(isize, 6 - (std.unicode.utf8CodepointSequenceLength(p.string_unicode_codepoint) catch unreachable)); + p.string_last_was_high_surrogate = false; + } else if (p.string_unicode_codepoint < 0xDC00) { + // 'high' surrogate + // takes 3 bytes to encode a half surrogate pair into wtf8 + p.string_escapes.Some.size_diff -= 6 - 3; + p.string_last_was_high_surrogate = true; + } else { + // 'low' surrogate + p.string_escapes.Some.size_diff -= 6; + if (p.string_last_was_high_surrogate) { + // takes 4 bytes to encode a full surrogate pair into utf8 + // 3 bytes are already reserved by high surrogate + p.string_escapes.Some.size_diff -= -1; + } else { + // takes 3 bytes to encode a half surrogate pair into wtf8 + p.string_escapes.Some.size_diff -= -3; + } + p.string_last_was_high_surrogate = false; + } + p.string_unicode_codepoint = undefined; }, - State.Number => { - p.complete = p.after_value_state == State.TopLevelEnd; + .Number => { + p.complete = p.after_value_state == .TopLevelEnd; switch (c) { '0' => { - p.state = State.NumberMaybeDotOrExponent; + p.state = .NumberMaybeDotOrExponent; }, '1'...'9' => { - p.state = State.NumberMaybeDigitOrDotOrExponent; + p.state = .NumberMaybeDigitOrDotOrExponent; }, else => { return error.InvalidNumber; @@ -669,52 +745,63 @@ pub const StreamingParser = struct { } }, - State.NumberMaybeDotOrExponent => { - p.complete = p.after_value_state == State.TopLevelEnd; + .NumberMaybeDotOrExponent => { + p.complete = p.after_value_state == .TopLevelEnd; switch (c) { '.' => { p.number_is_integer = false; - p.state = State.NumberFractionalRequired; + p.state = .NumberFractionalRequired; }, 'e', 'E' => { p.number_is_integer = false; - p.state = State.NumberExponent; + p.state = .NumberExponent; }, else => { p.state = p.after_value_state; - token.* = Token.initNumber(p.count, p.number_is_integer); + token.* = .{ + .Number = .{ + .count = p.count, + .is_integer = p.number_is_integer, + }, + }; + p.number_is_integer = undefined; return true; }, } }, - State.NumberMaybeDigitOrDotOrExponent => { - p.complete = p.after_value_state == State.TopLevelEnd; + .NumberMaybeDigitOrDotOrExponent => { + p.complete = p.after_value_state == .TopLevelEnd; switch (c) { '.' => { p.number_is_integer = false; - p.state = State.NumberFractionalRequired; + p.state = .NumberFractionalRequired; }, 'e', 'E' => { p.number_is_integer = false; - p.state = State.NumberExponent; + p.state = .NumberExponent; }, '0'...'9' => { // another digit }, else => { p.state = p.after_value_state; - token.* = Token.initNumber(p.count, p.number_is_integer); + token.* = .{ + .Number = .{ + .count = p.count, + .is_integer = p.number_is_integer, + }, + }; return true; }, } }, - State.NumberFractionalRequired => { - p.complete = p.after_value_state == State.TopLevelEnd; + .NumberFractionalRequired => { + p.complete = p.after_value_state == .TopLevelEnd; switch (c) { '0'...'9' => { - p.state = State.NumberFractional; + p.state = .NumberFractional; }, else => { return error.InvalidNumber; @@ -722,139 +809,154 @@ pub const StreamingParser = struct { } }, - State.NumberFractional => { - p.complete = p.after_value_state == State.TopLevelEnd; + .NumberFractional => { + p.complete = p.after_value_state == .TopLevelEnd; switch (c) { '0'...'9' => { // another digit }, 'e', 'E' => { p.number_is_integer = false; - p.state = State.NumberExponent; + p.state = .NumberExponent; }, else => { p.state = p.after_value_state; - token.* = Token.initNumber(p.count, p.number_is_integer); + token.* = .{ + .Number = .{ + .count = p.count, + .is_integer = p.number_is_integer, + }, + }; return true; }, } }, - State.NumberMaybeExponent => { - p.complete = p.after_value_state == State.TopLevelEnd; + .NumberMaybeExponent => { + p.complete = p.after_value_state == .TopLevelEnd; switch (c) { 'e', 'E' => { p.number_is_integer = false; - p.state = State.NumberExponent; + p.state = .NumberExponent; }, else => { p.state = p.after_value_state; - token.* = Token.initNumber(p.count, p.number_is_integer); + token.* = .{ + .Number = .{ + .count = p.count, + .is_integer = p.number_is_integer, + }, + }; return true; }, } }, - State.NumberExponent => switch (c) { + .NumberExponent => switch (c) { '-', '+' => { p.complete = false; - p.state = State.NumberExponentDigitsRequired; + p.state = .NumberExponentDigitsRequired; }, '0'...'9' => { - p.complete = p.after_value_state == State.TopLevelEnd; - p.state = State.NumberExponentDigits; + p.complete = p.after_value_state == .TopLevelEnd; + p.state = .NumberExponentDigits; }, else => { return error.InvalidNumber; }, }, - State.NumberExponentDigitsRequired => switch (c) { + .NumberExponentDigitsRequired => switch (c) { '0'...'9' => { - p.complete = p.after_value_state == State.TopLevelEnd; - p.state = State.NumberExponentDigits; + p.complete = p.after_value_state == .TopLevelEnd; + p.state = .NumberExponentDigits; }, else => { return error.InvalidNumber; }, }, - State.NumberExponentDigits => { - p.complete = p.after_value_state == State.TopLevelEnd; + .NumberExponentDigits => { + p.complete = p.after_value_state == .TopLevelEnd; switch (c) { '0'...'9' => { // another digit }, else => { p.state = p.after_value_state; - token.* = Token.initNumber(p.count, p.number_is_integer); + token.* = .{ + .Number = .{ + .count = p.count, + .is_integer = p.number_is_integer, + }, + }; return true; }, } }, - State.TrueLiteral1 => switch (c) { - 'r' => p.state = State.TrueLiteral2, + .TrueLiteral1 => switch (c) { + 'r' => p.state = .TrueLiteral2, else => return error.InvalidLiteral, }, - State.TrueLiteral2 => switch (c) { - 'u' => p.state = State.TrueLiteral3, + .TrueLiteral2 => switch (c) { + 'u' => p.state = .TrueLiteral3, else => return error.InvalidLiteral, }, - State.TrueLiteral3 => switch (c) { + .TrueLiteral3 => switch (c) { 'e' => { p.state = p.after_value_state; - p.complete = p.state == State.TopLevelEnd; - token.* = Token.init(Token.Id.True, p.count + 1, 1); + p.complete = p.state == .TopLevelEnd; + token.* = Token.True; }, else => { return error.InvalidLiteral; }, }, - State.FalseLiteral1 => switch (c) { - 'a' => p.state = State.FalseLiteral2, + .FalseLiteral1 => switch (c) { + 'a' => p.state = .FalseLiteral2, else => return error.InvalidLiteral, }, - State.FalseLiteral2 => switch (c) { - 'l' => p.state = State.FalseLiteral3, + .FalseLiteral2 => switch (c) { + 'l' => p.state = .FalseLiteral3, else => return error.InvalidLiteral, }, - State.FalseLiteral3 => switch (c) { - 's' => p.state = State.FalseLiteral4, + .FalseLiteral3 => switch (c) { + 's' => p.state = .FalseLiteral4, else => return error.InvalidLiteral, }, - State.FalseLiteral4 => switch (c) { + .FalseLiteral4 => switch (c) { 'e' => { p.state = p.after_value_state; - p.complete = p.state == State.TopLevelEnd; - token.* = Token.init(Token.Id.False, p.count + 1, 1); + p.complete = p.state == .TopLevelEnd; + token.* = Token.False; }, else => { return error.InvalidLiteral; }, }, - State.NullLiteral1 => switch (c) { - 'u' => p.state = State.NullLiteral2, + .NullLiteral1 => switch (c) { + 'u' => p.state = .NullLiteral2, else => return error.InvalidLiteral, }, - State.NullLiteral2 => switch (c) { - 'l' => p.state = State.NullLiteral3, + .NullLiteral2 => switch (c) { + 'l' => p.state = .NullLiteral3, else => return error.InvalidLiteral, }, - State.NullLiteral3 => switch (c) { + .NullLiteral3 => switch (c) { 'l' => { p.state = p.after_value_state; - p.complete = p.state == State.TopLevelEnd; - token.* = Token.init(Token.Id.Null, p.count + 1, 1); + p.complete = p.state == .TopLevelEnd; + token.* = Token.Null; }, else => { return error.InvalidLiteral; @@ -905,7 +1007,7 @@ pub const TokenStream = struct { } } - // Without this a bare number fails, becasue the streaming parser doesn't know it ended + // Without this a bare number fails, the streaming parser doesn't know the input ended try self.parser.feed(' ', &t1, &t2); self.i += 1; @@ -919,9 +1021,9 @@ pub const TokenStream = struct { } }; -fn checkNext(p: *TokenStream, id: Token.Id) void { +fn checkNext(p: *TokenStream, id: std.meta.TagType(Token)) void { const token = (p.next() catch unreachable).?; - debug.assert(token.id == id); + debug.assert(std.meta.activeTag(token) == id); } test "json.token" { @@ -944,35 +1046,35 @@ test "json.token" { var p = TokenStream.init(s); - checkNext(&p, Token.Id.ObjectBegin); - checkNext(&p, Token.Id.String); // Image - checkNext(&p, Token.Id.ObjectBegin); - checkNext(&p, Token.Id.String); // Width - checkNext(&p, Token.Id.Number); - checkNext(&p, Token.Id.String); // Height - checkNext(&p, Token.Id.Number); - checkNext(&p, Token.Id.String); // Title - checkNext(&p, Token.Id.String); - checkNext(&p, Token.Id.String); // Thumbnail - checkNext(&p, Token.Id.ObjectBegin); - checkNext(&p, Token.Id.String); // Url - checkNext(&p, Token.Id.String); - checkNext(&p, Token.Id.String); // Height - checkNext(&p, Token.Id.Number); - checkNext(&p, Token.Id.String); // Width - checkNext(&p, Token.Id.Number); - checkNext(&p, Token.Id.ObjectEnd); - checkNext(&p, Token.Id.String); // Animated - checkNext(&p, Token.Id.False); - checkNext(&p, Token.Id.String); // IDs - checkNext(&p, Token.Id.ArrayBegin); - checkNext(&p, Token.Id.Number); - checkNext(&p, Token.Id.Number); - checkNext(&p, Token.Id.Number); - checkNext(&p, Token.Id.Number); - checkNext(&p, Token.Id.ArrayEnd); - checkNext(&p, Token.Id.ObjectEnd); - checkNext(&p, Token.Id.ObjectEnd); + checkNext(&p, .ObjectBegin); + checkNext(&p, .String); // Image + checkNext(&p, .ObjectBegin); + checkNext(&p, .String); // Width + checkNext(&p, .Number); + checkNext(&p, .String); // Height + checkNext(&p, .Number); + checkNext(&p, .String); // Title + checkNext(&p, .String); + checkNext(&p, .String); // Thumbnail + checkNext(&p, .ObjectBegin); + checkNext(&p, .String); // Url + checkNext(&p, .String); + checkNext(&p, .String); // Height + checkNext(&p, .Number); + checkNext(&p, .String); // Width + checkNext(&p, .Number); + checkNext(&p, .ObjectEnd); + checkNext(&p, .String); // Animated + checkNext(&p, .False); + checkNext(&p, .String); // IDs + checkNext(&p, .ArrayBegin); + checkNext(&p, .Number); + checkNext(&p, .Number); + checkNext(&p, .Number); + checkNext(&p, .Number); + checkNext(&p, .ArrayEnd); + checkNext(&p, .ObjectEnd); + checkNext(&p, .ObjectEnd); testing.expect((try p.next()) == null); } @@ -1081,7 +1183,7 @@ pub const Parser = struct { pub fn init(allocator: *Allocator, copy_strings: bool) Parser { return Parser{ .allocator = allocator, - .state = State.Simple, + .state = .Simple, .copy_strings = copy_strings, .stack = Array.init(allocator), }; @@ -1092,7 +1194,7 @@ pub const Parser = struct { } pub fn reset(p: *Parser) void { - p.state = State.Simple; + p.state = .Simple; p.stack.shrink(0); } @@ -1118,8 +1220,8 @@ pub const Parser = struct { // can be cleaned up on error correctly during a `parse` on call. fn transition(p: *Parser, allocator: *Allocator, input: []const u8, i: usize, token: Token) !void { switch (p.state) { - State.ObjectKey => switch (token.id) { - Token.Id.ObjectEnd => { + .ObjectKey => switch (token) { + .ObjectEnd => { if (p.stack.len == 1) { return; } @@ -1127,9 +1229,9 @@ pub const Parser = struct { var value = p.stack.pop(); try p.pushToParent(&value); }, - Token.Id.String => { - try p.stack.append(try p.parseString(allocator, token, input, i)); - p.state = State.ObjectValue; + .String => |s| { + try p.stack.append(try p.parseString(allocator, s, input, i)); + p.state = .ObjectValue; }, else => { // The streaming parser would return an error eventually. @@ -1138,54 +1240,54 @@ pub const Parser = struct { return error.InvalidLiteral; }, }, - State.ObjectValue => { + .ObjectValue => { var object = &p.stack.items[p.stack.len - 2].Object; var key = p.stack.items[p.stack.len - 1].String; - switch (token.id) { - Token.Id.ObjectBegin => { + switch (token) { + .ObjectBegin => { try p.stack.append(Value{ .Object = ObjectMap.init(allocator) }); - p.state = State.ObjectKey; + p.state = .ObjectKey; }, - Token.Id.ArrayBegin => { + .ArrayBegin => { try p.stack.append(Value{ .Array = Array.init(allocator) }); - p.state = State.ArrayValue; + p.state = .ArrayValue; }, - Token.Id.String => { - _ = try object.put(key, try p.parseString(allocator, token, input, i)); + .String => |s| { + _ = try object.put(key, try p.parseString(allocator, s, input, i)); _ = p.stack.pop(); - p.state = State.ObjectKey; + p.state = .ObjectKey; }, - Token.Id.Number => { - _ = try object.put(key, try p.parseNumber(token, input, i)); + .Number => |n| { + _ = try object.put(key, try p.parseNumber(n, input, i)); _ = p.stack.pop(); - p.state = State.ObjectKey; + p.state = .ObjectKey; }, - Token.Id.True => { + .True => { _ = try object.put(key, Value{ .Bool = true }); _ = p.stack.pop(); - p.state = State.ObjectKey; + p.state = .ObjectKey; }, - Token.Id.False => { + .False => { _ = try object.put(key, Value{ .Bool = false }); _ = p.stack.pop(); - p.state = State.ObjectKey; + p.state = .ObjectKey; }, - Token.Id.Null => { + .Null => { _ = try object.put(key, Value.Null); _ = p.stack.pop(); - p.state = State.ObjectKey; + p.state = .ObjectKey; }, - Token.Id.ObjectEnd, Token.Id.ArrayEnd => { + .ObjectEnd, .ArrayEnd => { unreachable; }, } }, - State.ArrayValue => { + .ArrayValue => { var array = &p.stack.items[p.stack.len - 1].Array; - switch (token.id) { - Token.Id.ArrayEnd => { + switch (token) { + .ArrayEnd => { if (p.stack.len == 1) { return; } @@ -1193,59 +1295,59 @@ pub const Parser = struct { var value = p.stack.pop(); try p.pushToParent(&value); }, - Token.Id.ObjectBegin => { + .ObjectBegin => { try p.stack.append(Value{ .Object = ObjectMap.init(allocator) }); - p.state = State.ObjectKey; + p.state = .ObjectKey; }, - Token.Id.ArrayBegin => { + .ArrayBegin => { try p.stack.append(Value{ .Array = Array.init(allocator) }); - p.state = State.ArrayValue; + p.state = .ArrayValue; }, - Token.Id.String => { - try array.append(try p.parseString(allocator, token, input, i)); + .String => |s| { + try array.append(try p.parseString(allocator, s, input, i)); }, - Token.Id.Number => { - try array.append(try p.parseNumber(token, input, i)); + .Number => |n| { + try array.append(try p.parseNumber(n, input, i)); }, - Token.Id.True => { + .True => { try array.append(Value{ .Bool = true }); }, - Token.Id.False => { + .False => { try array.append(Value{ .Bool = false }); }, - Token.Id.Null => { + .Null => { try array.append(Value.Null); }, - Token.Id.ObjectEnd => { + .ObjectEnd => { unreachable; }, } }, - State.Simple => switch (token.id) { - Token.Id.ObjectBegin => { + .Simple => switch (token) { + .ObjectBegin => { try p.stack.append(Value{ .Object = ObjectMap.init(allocator) }); - p.state = State.ObjectKey; + p.state = .ObjectKey; }, - Token.Id.ArrayBegin => { + .ArrayBegin => { try p.stack.append(Value{ .Array = Array.init(allocator) }); - p.state = State.ArrayValue; + p.state = .ArrayValue; }, - Token.Id.String => { - try p.stack.append(try p.parseString(allocator, token, input, i)); + .String => |s| { + try p.stack.append(try p.parseString(allocator, s, input, i)); }, - Token.Id.Number => { - try p.stack.append(try p.parseNumber(token, input, i)); + .Number => |n| { + try p.stack.append(try p.parseNumber(n, input, i)); }, - Token.Id.True => { + .True => { try p.stack.append(Value{ .Bool = true }); }, - Token.Id.False => { + .False => { try p.stack.append(Value{ .Bool = false }); }, - Token.Id.Null => { + .Null => { try p.stack.append(Value.Null); }, - Token.Id.ObjectEnd, Token.Id.ArrayEnd => { + .ObjectEnd, .ArrayEnd => { unreachable; }, }, @@ -1260,12 +1362,12 @@ pub const Parser = struct { var object = &p.stack.items[p.stack.len - 1].Object; _ = try object.put(key, value.*); - p.state = State.ObjectKey; + p.state = .ObjectKey; }, // Array Parent -> [ ..., <array>, value ] Value.Array => |*array| { try array.append(value.*); - p.state = State.ArrayValue; + p.state = .ArrayValue; }, else => { unreachable; @@ -1273,80 +1375,78 @@ pub const Parser = struct { } } - fn parseString(p: *Parser, allocator: *Allocator, token: Token, input: []const u8, i: usize) !Value { + fn parseString(p: *Parser, allocator: *Allocator, s: std.meta.TagPayloadType(Token, Token.String), input: []const u8, i: usize) !Value { // TODO: We don't strictly have to copy values which do not contain any escape // characters if flagged with the option. - const slice = token.slice(input, i); - return Value{ .String = try unescapeStringAlloc(allocator, slice) }; + const slice = s.slice(input, i); + switch (s.escapes) { + .None => return Value{ .String = try mem.dupe(allocator, u8, slice) }, + .Some => |some_escapes| { + const output = try allocator.alloc(u8, s.decodedLength()); + errdefer allocator.free(output); + try unescapeString(output, slice); + return Value{ .String = output }; + }, + } } - fn parseNumber(p: *Parser, token: Token, input: []const u8, i: usize) !Value { - return if (token.number_is_integer) - Value{ .Integer = try std.fmt.parseInt(i64, token.slice(input, i), 10) } + fn parseNumber(p: *Parser, n: std.meta.TagPayloadType(Token, Token.Number), input: []const u8, i: usize) !Value { + return if (n.is_integer) + Value{ .Integer = try std.fmt.parseInt(i64, n.slice(input, i), 10) } else - Value{ .Float = try std.fmt.parseFloat(f64, token.slice(input, i)) }; + Value{ .Float = try std.fmt.parseFloat(f64, n.slice(input, i)) }; } }; // Unescape a JSON string // Only to be used on strings already validated by the parser // (note the unreachable statements and lack of bounds checking) -// Optimized for arena allocators, uses Allocator.shrink -// -// Idea: count how many bytes we will need to allocate in the streaming parser and store it -// in the token to avoid allocating too much memory or iterating through the string again -// Downside: need to find how many bytes a unicode escape sequence will produce twice -fn unescapeStringAlloc(alloc: *Allocator, input: []const u8) ![]u8 { - const output = try alloc.alloc(u8, input.len); - errdefer alloc.free(output); - +fn unescapeString(output: []u8, input: []const u8) !void { var inIndex: usize = 0; var outIndex: usize = 0; - while(inIndex < input.len) { - if(input[inIndex] != '\\'){ + while (inIndex < input.len) { + if (input[inIndex] != '\\') { // not an escape sequence output[outIndex] = input[inIndex]; inIndex += 1; outIndex += 1; - } else if(input[inIndex + 1] != 'u'){ + } else if (input[inIndex + 1] != 'u') { // a simple escape sequence - output[outIndex] = @as(u8, - switch(input[inIndex + 1]){ - '\\' => '\\', - '/' => '/', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - 'f' => 12, - 'b' => 8, - '"' => '"', - else => unreachable - } - ); + output[outIndex] = @as(u8, switch (input[inIndex + 1]) { + '\\' => '\\', + '/' => '/', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + 'f' => 12, + 'b' => 8, + '"' => '"', + else => unreachable, + }); inIndex += 2; outIndex += 1; } else { // a unicode escape sequence - const firstCodeUnit = std.fmt.parseInt(u16, input[inIndex+2 .. inIndex+6], 16) catch unreachable; + const firstCodeUnit = std.fmt.parseInt(u16, input[inIndex + 2 .. inIndex + 6], 16) catch unreachable; // guess optimistically that it's not a surrogate pair - if(std.unicode.utf8Encode(firstCodeUnit, output[outIndex..])) |byteCount| { + if (std.unicode.utf8Encode(firstCodeUnit, output[outIndex..])) |byteCount| { outIndex += byteCount; inIndex += 6; } else |err| { // it might be a surrogate pair - if(err != error.Utf8CannotEncodeSurrogateHalf) { + if (err != error.Utf8CannotEncodeSurrogateHalf) { return error.InvalidUnicodeHexSymbol; } // check if a second code unit is present - if(inIndex + 7 >= input.len or input[inIndex + 6] != '\\' or input[inIndex + 7] != 'u'){ + if (inIndex + 7 >= input.len or input[inIndex + 6] != '\\' or input[inIndex + 7] != 'u') { return error.InvalidUnicodeHexSymbol; } - - const secondCodeUnit = std.fmt.parseInt(u16, input[inIndex+8 .. inIndex+12], 16) catch unreachable; - - if(std.unicode.utf16leToUtf8(output[outIndex..], &[2]u16{ firstCodeUnit, secondCodeUnit })) |byteCount| { + + const secondCodeUnit = std.fmt.parseInt(u16, input[inIndex + 8 .. inIndex + 12], 16) catch unreachable; + + if (std.unicode.utf16leToUtf8(output[outIndex..], &[2]u16{ firstCodeUnit, secondCodeUnit })) |byteCount| { outIndex += byteCount; inIndex += 12; } else |_| { @@ -1355,8 +1455,7 @@ fn unescapeStringAlloc(alloc: *Allocator, input: []const u8) ![]u8 { } } } - - return alloc.shrink(output, outIndex); + assert(outIndex == output.len); } test "json.parser.dynamic" { diff --git a/lib/std/meta.zig b/lib/std/meta.zig index e0ddbed274..5e5850e393 100644 --- a/lib/std/meta.zig +++ b/lib/std/meta.zig @@ -364,10 +364,8 @@ test "std.meta.activeTag" { ///Given a tagged union type, and an enum, return the type of the union /// field corresponding to the enum tag. -pub fn TagPayloadType(comptime U: type, tag: var) type { - const Tag = @TypeOf(tag); +pub fn TagPayloadType(comptime U: type, tag: @TagType(U)) type { testing.expect(trait.is(builtin.TypeId.Union)(U)); - testing.expect(trait.is(builtin.TypeId.Enum)(Tag)); const info = @typeInfo(U).Union; |
