aboutsummaryrefslogtreecommitdiff
path: root/lib/std/json.zig
diff options
context:
space:
mode:
authorAndrew Kelley <andrew@ziglang.org>2019-12-29 18:31:10 -0500
committerGitHub <noreply@github.com>2019-12-29 18:31:10 -0500
commit54231e832bae780c5012fc5cd30932447f1e1d47 (patch)
tree4475e1625e81f320e8ffe358f4ee8dca557569ba /lib/std/json.zig
parent6af39aa49afeb3498d6c5dfa0b60a0fdc15ca47c (diff)
parent6d3b95a708c87ed81bd4f270b956913d82a0972d (diff)
downloadzig-54231e832bae780c5012fc5cd30932447f1e1d47.tar.gz
zig-54231e832bae780c5012fc5cd30932447f1e1d47.zip
Merge pull request #3648 from xackus/json-unescape
breaking: JSON unescape
Diffstat (limited to 'lib/std/json.zig')
-rw-r--r--lib/std/json.zig169
1 files changed, 142 insertions, 27 deletions
diff --git a/lib/std/json.zig b/lib/std/json.zig
index 030b940276..8d2d7d7477 100644
--- a/lib/std/json.zig
+++ b/lib/std/json.zig
@@ -10,18 +10,18 @@ const maxInt = std.math.maxInt;
pub const WriteStream = @import("json/write_stream.zig").WriteStream;
-// A single token slice into the parent string.
-//
-// Use `token.slice()` on the input at the current position to get the current slice.
+/// A single token slice into the parent string.
+///
+/// Use `token.slice()` on the input at the current position to get the current slice.
pub const Token = struct {
id: Id,
- // How many bytes do we skip before counting
+ /// How many bytes do we skip before counting
offset: u1,
- // Whether string contains a \uXXXX sequence and cannot be zero-copied
+ /// Whether string contains an escape sequence and cannot be zero-copied
string_has_escape: bool,
- // Whether number is simple and can be represented by an integer (i.e. no `.` or `e`)
+ /// Whether number is simple and can be represented by an integer (i.e. no `.` or `e`)
number_is_integer: bool,
- // How many bytes from the current position behind the start of this token is.
+ /// How many bytes from the current position behind the start of this token is.
count: usize,
pub const Id = enum {
@@ -66,7 +66,7 @@ pub const Token = struct {
};
}
- // A marker token is a zero-length
+ /// A marker token is a zero-length token.
pub fn initMarker(id: Id) Token {
return Token{
.id = id,
@@ -77,19 +77,19 @@ pub const Token = struct {
};
}
- // Slice into the underlying input string.
+ /// Slice into the underlying input string.
pub fn slice(self: Token, input: []const u8, i: usize) []const u8 {
return input[i + self.offset - self.count .. i + self.offset];
}
};
-// A small streaming JSON parser. This accepts input one byte at a time and returns tokens as
-// they are encountered. No copies or allocations are performed during parsing and the entire
-// parsing state requires ~40-50 bytes of stack space.
-//
-// Conforms strictly to RFC8529.
-//
-// For a non-byte based wrapper, consider using TokenStream instead.
+/// A small streaming JSON parser. This accepts input one byte at a time and returns tokens as
+/// they are encountered. No copies or allocations are performed during parsing and the entire
+/// parsing state requires ~40-50 bytes of stack space.
+///
+/// Conforms strictly to RFC 8259.
+///
+/// For a non-byte based wrapper, consider using TokenStream instead.
pub const StreamingParser = struct {
// Current state
state: State,
@@ -205,10 +205,10 @@ pub const StreamingParser = struct {
InvalidControlCharacter,
};
- // Give another byte to the parser and obtain any new tokens. This may (rarely) return two
- // tokens. token2 is always null if token1 is null.
- //
- // There is currently no error recovery on a bad stream.
+ /// Give another byte to the parser and obtain any new tokens. This may (rarely) return two
+ /// tokens. token2 is always null if token1 is null.
+ ///
+ /// There is currently no error recovery on a bad stream.
pub fn feed(p: *StreamingParser, c: u8, token1: *?Token, token2: *?Token) Error!void {
token1.* = null;
token2.* = null;
@@ -866,7 +866,7 @@ pub const StreamingParser = struct {
}
};
-// A small wrapper over a StreamingParser for full slices. Returns a stream of json Tokens.
+/// A small wrapper over a StreamingParser for full slices. Returns a stream of json Tokens.
pub const TokenStream = struct {
i: usize,
slice: []const u8,
@@ -905,7 +905,13 @@ pub const TokenStream = struct {
}
}
- if (self.parser.complete) {
+ // Without this a bare number fails, because the streaming parser doesn't know it ended
+ try self.parser.feed(' ', &t1, &t2);
+ self.i += 1;
+
+ if (t1) |token| {
+ return token;
+ } else if (self.parser.complete) {
return null;
} else {
return error.UnexpectedEndOfJson;
@@ -971,8 +977,8 @@ test "json.token" {
testing.expect((try p.next()) == null);
}
-// Validate a JSON string. This does not limit number precision so a decoder may not necessarily
-// be able to decode the string even if this returns true.
+/// Validate a JSON string. This does not limit number precision so a decoder may not necessarily
+/// be able to decode the string even if this returns true.
pub fn validate(s: []const u8) bool {
var p = StreamingParser.init();
@@ -1009,6 +1015,8 @@ pub const ValueTree = struct {
pub const ObjectMap = StringHashMap(Value);
pub const Array = ArrayList(Value);
+/// Represents a JSON value
+/// Currently only supports numbers that fit into i64 or f64.
pub const Value = union(enum) {
Null,
Bool: bool,
@@ -1055,7 +1063,7 @@ pub const Value = union(enum) {
}
};
-// A non-stream JSON parser which constructs a tree of Value's.
+/// A non-stream JSON parser which constructs a tree of Value's.
pub const Parser = struct {
allocator: *Allocator,
state: State,
@@ -1124,7 +1132,10 @@ pub const Parser = struct {
p.state = State.ObjectValue;
},
else => {
- unreachable;
+ // The streaming parser would return an error eventually.
+ // To prevent invalid state we return an error now.
+ // TODO make the streaming parser return an error as soon as it encounters an invalid object key
+ return error.InvalidLiteral;
},
},
State.ObjectValue => {
@@ -1266,7 +1277,7 @@ pub const Parser = struct {
// TODO: We don't strictly have to copy values which do not contain any escape
// characters if flagged with the option.
const slice = token.slice(input, i);
- return Value{ .String = try mem.dupe(allocator, u8, slice) };
+ return Value{ .String = try unescapeStringAlloc(allocator, slice) };
}
fn parseNumber(p: *Parser, token: Token, input: []const u8, i: usize) !Value {
@@ -1277,6 +1288,77 @@ pub const Parser = struct {
}
};
+// Unescape a JSON string
+// Only to be used on strings already validated by the parser
+// (note the unreachable statements and lack of bounds checking)
+// Optimized for arena allocators, uses Allocator.shrink
+//
+// Idea: count how many bytes we will need to allocate in the streaming parser and store it
+// in the token to avoid allocating too much memory or iterating through the string again
+// Downside: need to find how many bytes a unicode escape sequence will produce twice
+fn unescapeStringAlloc(alloc: *Allocator, input: []const u8) ![]u8 {
+ const output = try alloc.alloc(u8, input.len);
+ errdefer alloc.free(output);
+
+ var inIndex: usize = 0;
+ var outIndex: usize = 0;
+
+ while(inIndex < input.len) {
+ if(input[inIndex] != '\\'){
+ // not an escape sequence
+ output[outIndex] = input[inIndex];
+ inIndex += 1;
+ outIndex += 1;
+ } else if(input[inIndex + 1] != 'u'){
+ // a simple escape sequence
+ output[outIndex] = @as(u8,
+ switch(input[inIndex + 1]){
+ '\\' => '\\',
+ '/' => '/',
+ 'n' => '\n',
+ 'r' => '\r',
+ 't' => '\t',
+ 'f' => 12,
+ 'b' => 8,
+ '"' => '"',
+ else => unreachable
+ }
+ );
+ inIndex += 2;
+ outIndex += 1;
+ } else {
+ // a unicode escape sequence
+ const firstCodeUnit = std.fmt.parseInt(u16, input[inIndex+2 .. inIndex+6], 16) catch unreachable;
+
+ // guess optimistically that it's not a surrogate pair
+ if(std.unicode.utf8Encode(firstCodeUnit, output[outIndex..])) |byteCount| {
+ outIndex += byteCount;
+ inIndex += 6;
+ } else |err| {
+ // it might be a surrogate pair
+ if(err != error.Utf8CannotEncodeSurrogateHalf) {
+ return error.InvalidUnicodeHexSymbol;
+ }
+ // check if a second code unit is present
+ if(inIndex + 7 >= input.len or input[inIndex + 6] != '\\' or input[inIndex + 7] != 'u'){
+ return error.InvalidUnicodeHexSymbol;
+ }
+
+ const secondCodeUnit = std.fmt.parseInt(u16, input[inIndex+8 .. inIndex+12], 16) catch unreachable;
+
+ if(std.unicode.utf16leToUtf8(output[outIndex..], [2]u16{ firstCodeUnit, secondCodeUnit })) |byteCount| {
+ outIndex += byteCount;
+ inIndex += 12;
+ } else |_| {
+ return error.InvalidUnicodeHexSymbol;
+ }
+ }
+ }
+ }
+
+ return alloc.shrink(output, outIndex);
+}
+
test "json.parser.dynamic" {
var p = Parser.init(debug.global_allocator, false);
defer p.deinit();
@@ -1399,3 +1481,36 @@ test "integer after float has proper type" {
);
std.testing.expect(json.Object.getValue("ints").?.Array.at(0) == .Integer);
}
+
+test "escaped characters" {
+ const input =
+ \\{
+ \\ "backslash": "\\",
+ \\ "forwardslash": "\/",
+ \\ "newline": "\n",
+ \\ "carriagereturn": "\r",
+ \\ "tab": "\t",
+ \\ "formfeed": "\f",
+ \\ "backspace": "\b",
+ \\ "doublequote": "\"",
+ \\ "unicode": "\u0105",
+ \\ "surrogatepair": "\ud83d\ude02"
+ \\}
+ ;
+
+ var p = Parser.init(debug.global_allocator, false);
+ const tree = try p.parse(input);
+
+ const obj = tree.root.Object;
+
+ testing.expectEqualSlices(u8, obj.get("backslash").?.value.String, "\\");
+ testing.expectEqualSlices(u8, obj.get("forwardslash").?.value.String, "/");
+ testing.expectEqualSlices(u8, obj.get("newline").?.value.String, "\n");
+ testing.expectEqualSlices(u8, obj.get("carriagereturn").?.value.String, "\r");
+ testing.expectEqualSlices(u8, obj.get("tab").?.value.String, "\t");
+ testing.expectEqualSlices(u8, obj.get("formfeed").?.value.String, "\x0C");
+ testing.expectEqualSlices(u8, obj.get("backspace").?.value.String, "\x08");
+ testing.expectEqualSlices(u8, obj.get("doublequote").?.value.String, "\"");
+ testing.expectEqualSlices(u8, obj.get("unicode").?.value.String, "ą");
+ testing.expectEqualSlices(u8, obj.get("surrogatepair").?.value.String, "😂");
+}