diff options
Diffstat (limited to 'src/Package')
| -rw-r--r-- | src/Package/Fetch.zig | 1557 | ||||
| -rw-r--r-- | src/Package/Fetch/git.zig | 1466 | ||||
| -rw-r--r-- | src/Package/Fetch/git/testdata/testrepo.idx | bin | 0 -> 3480 bytes | |||
| -rw-r--r-- | src/Package/Fetch/git/testdata/testrepo.pack | bin | 0 -> 6511 bytes | |||
| -rw-r--r-- | src/Package/Manifest.zig | 566 | ||||
| -rw-r--r-- | src/Package/Module.zig | 34 | ||||
| -rw-r--r-- | src/Package/hash.zig | 153 |
7 files changed, 3623 insertions, 153 deletions
diff --git a/src/Package/Fetch.zig b/src/Package/Fetch.zig new file mode 100644 index 0000000000..3aae5a533f --- /dev/null +++ b/src/Package/Fetch.zig @@ -0,0 +1,1557 @@ +//! Represents one independent job whose responsibility is to: +//! +//! 1. Check the global zig package cache to see if the hash already exists. +//! If so, load, parse, and validate the build.zig.zon file therein, and +//! go to step 8. Likewise if the location is a relative path, treat this +//! the same as a cache hit. Otherwise, proceed. +//! 2. Fetch and unpack a URL into a temporary directory. +//! 3. Load, parse, and validate the build.zig.zon file therein. It is allowed +//! for the file to be missing, in which case this fetched package is considered +//! to be a "naked" package. +//! 4. Apply inclusion rules of the build.zig.zon to the temporary directory by +//! deleting excluded files. If any errors occurred for files that were +//! ultimately excluded, those errors should be ignored, such as failure to +//! create symlinks that weren't supposed to be included anyway. +//! 5. Compute the package hash based on the remaining files in the temporary +//! directory. +//! 6. Rename the temporary directory into the global zig package cache +//! directory. If the hash already exists, delete the temporary directory and +//! leave the zig package cache directory untouched as it may be in use by the +//! system. This is done even if the hash is invalid, in case the package with +//! the different hash is used in the future. +//! 7. Validate the computed hash against the expected hash. If invalid, +//! this job is done. +//! 8. Spawn a new fetch job for each dependency in the manifest file. Use +//! a mutex and a hash map so that redundant jobs do not get queued up. +//! +//! All of this must be done while referring only to the state inside this struct +//! because this work will be done in a dedicated thread. 
+ +arena: std.heap.ArenaAllocator, +location: Location, +location_tok: std.zig.Ast.TokenIndex, +hash_tok: std.zig.Ast.TokenIndex, +parent_package_root: Package.Path, +parent_manifest_ast: ?*const std.zig.Ast, +prog_node: *std.Progress.Node, +job_queue: *JobQueue, +/// If true, don't add an error for a missing hash. This flag is not passed +/// down to recursive dependencies. It's intended to be used only by the CLI. +omit_missing_hash_error: bool, +/// If true, don't fail when a manifest file is missing the `paths` field, +/// which specifies inclusion rules. This is intended to be true for the first +/// fetch task and false for the recursive dependencies. +allow_missing_paths_field: bool, + +// Above this are fields provided as inputs to `run`. +// Below this are fields populated by `run`. + +/// This will either be relative to `global_cache`, or to the build root of +/// the root package. +package_root: Package.Path, +error_bundle: ErrorBundle.Wip, +manifest: ?Manifest, +manifest_ast: std.zig.Ast, +actual_hash: Manifest.Digest, +/// Fetch logic notices whether a package has a build.zig file and sets this flag. +has_build_zig: bool, +/// Indicates whether the task aborted due to an out-of-memory condition. +oom_flag: bool, + +// This field is used by the CLI only, untouched by this file. + +/// The module for this `Fetch` task's package, which exposes `build.zig` as +/// the root source file. +module: ?*Package.Module, + +/// Contains shared state among all `Fetch` tasks. +pub const JobQueue = struct { + mutex: std.Thread.Mutex = .{}, + /// It's an array hash map so that it can be sorted before rendering the + /// dependencies.zig source file. + /// Protected by `mutex`. + table: Table = .{}, + /// `table` may be missing some tasks such as ones that failed, so this + /// field contains references to all of them. + /// Protected by `mutex`. 
+ all_fetches: std.ArrayListUnmanaged(*Fetch) = .{}, + + http_client: *std.http.Client, + thread_pool: *ThreadPool, + wait_group: WaitGroup = .{}, + global_cache: Cache.Directory, + recursive: bool, + work_around_btrfs_bug: bool, + + pub const Table = std.AutoArrayHashMapUnmanaged(Manifest.MultiHashHexDigest, *Fetch); + + pub fn deinit(jq: *JobQueue) void { + if (jq.all_fetches.items.len == 0) return; + const gpa = jq.all_fetches.items[0].arena.child_allocator; + jq.table.deinit(gpa); + // These must be deinitialized in reverse order because subsequent + // `Fetch` instances are allocated in prior ones' arenas. + // Sorry, I know it's a bit weird, but it slightly simplifies the + // critical section. + while (jq.all_fetches.popOrNull()) |f| f.deinit(); + jq.all_fetches.deinit(gpa); + jq.* = undefined; + } + + /// Dumps all subsequent error bundles into the first one. + pub fn consolidateErrors(jq: *JobQueue) !void { + const root = &jq.all_fetches.items[0].error_bundle; + const gpa = root.gpa; + for (jq.all_fetches.items[1..]) |fetch| { + if (fetch.error_bundle.root_list.items.len > 0) { + var bundle = try fetch.error_bundle.toOwnedBundle(""); + defer bundle.deinit(gpa); + try root.addBundleAsRoots(bundle); + } + } + } + + /// Creates the dependencies.zig source code for the build runner to obtain + /// via `@import("@dependencies")`. + pub fn createDependenciesSource(jq: *JobQueue, buf: *std.ArrayList(u8)) Allocator.Error!void { + const keys = jq.table.keys(); + + assert(keys.len != 0); // caller should have added the first one + if (keys.len == 1) { + // This is the first one. It must have no dependencies. + return createEmptyDependenciesSource(buf); + } + + try buf.appendSlice("pub const packages = struct {\n"); + + // Ensure the generated .zig file is deterministic. 
+ jq.table.sortUnstable(@as(struct { + keys: []const Manifest.MultiHashHexDigest, + pub fn lessThan(ctx: @This(), a_index: usize, b_index: usize) bool { + return std.mem.lessThan(u8, &ctx.keys[a_index], &ctx.keys[b_index]); + } + }, .{ .keys = keys })); + + for (keys, jq.table.values()) |hash, fetch| { + if (fetch == jq.all_fetches.items[0]) { + // The first one is a dummy package for the current project. + continue; + } + try buf.writer().print( + \\ pub const {} = struct {{ + \\ pub const build_root = "{q}"; + \\ + , .{ std.zig.fmtId(&hash), fetch.package_root }); + + if (fetch.has_build_zig) { + try buf.writer().print( + \\ pub const build_zig = @import("{}"); + \\ + , .{std.zig.fmtEscapes(&hash)}); + } + + if (fetch.manifest) |*manifest| { + try buf.appendSlice( + \\ pub const deps: []const struct { []const u8, []const u8 } = &.{ + \\ + ); + for (manifest.dependencies.keys(), manifest.dependencies.values()) |name, dep| { + const h = depDigest(fetch.package_root, jq.global_cache, dep) orelse continue; + try buf.writer().print( + " .{{ \"{}\", \"{}\" }},\n", + .{ std.zig.fmtEscapes(name), std.zig.fmtEscapes(&h) }, + ); + } + + try buf.appendSlice( + \\ }; + \\ }; + \\ + ); + } else { + try buf.appendSlice( + \\ pub const deps: []const struct { []const u8, []const u8 } = &.{}; + \\ }; + \\ + ); + } + } + + try buf.appendSlice( + \\}; + \\ + \\pub const root_deps: []const struct { []const u8, []const u8 } = &.{ + \\ + ); + + const root_fetch = jq.all_fetches.items[0]; + const root_manifest = &root_fetch.manifest.?; + + for (root_manifest.dependencies.keys(), root_manifest.dependencies.values()) |name, dep| { + const h = depDigest(root_fetch.package_root, jq.global_cache, dep) orelse continue; + try buf.writer().print( + " .{{ \"{}\", \"{}\" }},\n", + .{ std.zig.fmtEscapes(name), std.zig.fmtEscapes(&h) }, + ); + } + try buf.appendSlice("};\n"); + } + + pub fn createEmptyDependenciesSource(buf: *std.ArrayList(u8)) Allocator.Error!void { + try buf.appendSlice( + \\pub 
const packages = struct {}; + \\pub const root_deps: []const struct { []const u8, []const u8 } = &.{}; + \\ + ); + } +}; + +pub const Location = union(enum) { + remote: Remote, + /// A directory found inside the parent package. + relative_path: Package.Path, + /// Recursive Fetch tasks will never use this Location, but it may be + /// passed in by the CLI. Indicates the file contents here should be copied + /// into the global package cache. It may be a file relative to the cwd or + /// absolute, in which case it should be treated exactly like a `file://` + /// URL, or a directory, in which case it should be treated as an + /// already-unpacked directory (but still needs to be copied into the + /// global package cache and have inclusion rules applied). + path_or_url: []const u8, + + pub const Remote = struct { + url: []const u8, + /// If this is null it means the user omitted the hash field from a dependency. + /// It will be an error but the logic should still fetch and print the discovered hash. + hash: ?Manifest.MultiHashHexDigest, + }; +}; + +pub const RunError = error{ + OutOfMemory, + /// This error code is intended to be handled by inspecting the + /// `error_bundle` field. + FetchFailed, +}; + +pub fn run(f: *Fetch) RunError!void { + const eb = &f.error_bundle; + const arena = f.arena.allocator(); + const gpa = f.arena.child_allocator; + const cache_root = f.job_queue.global_cache; + + try eb.init(gpa); + + // Check the global zig package cache to see if the hash already exists. If + // so, load, parse, and validate the build.zig.zon file therein, and skip + // ahead to queuing up jobs for dependencies. Likewise if the location is a + // relative path, treat this the same as a cache hit. Otherwise, proceed. 
+ + const remote = switch (f.location) { + .relative_path => |pkg_root| { + if (fs.path.isAbsolute(pkg_root.sub_path)) return f.fail( + f.location_tok, + try eb.addString("expected path relative to build root; found absolute path"), + ); + if (f.hash_tok != 0) return f.fail( + f.hash_tok, + try eb.addString("path-based dependencies are not hashed"), + ); + if (std.mem.startsWith(u8, pkg_root.sub_path, "../") or + std.mem.eql(u8, pkg_root.sub_path, "..")) + { + return f.fail( + f.location_tok, + try eb.printString("dependency path outside project: '{}{s}'", .{ + pkg_root.root_dir, pkg_root.sub_path, + }), + ); + } + f.package_root = pkg_root; + try loadManifest(f, pkg_root); + if (!f.has_build_zig) try checkBuildFileExistence(f); + if (!f.job_queue.recursive) return; + return queueJobsForDeps(f); + }, + .remote => |remote| remote, + .path_or_url => |path_or_url| { + if (fs.cwd().openIterableDir(path_or_url, .{})) |dir| { + var resource: Resource = .{ .dir = dir }; + return runResource(f, path_or_url, &resource, null); + } else |dir_err| { + const file_err = if (dir_err == error.NotDir) e: { + if (fs.cwd().openFile(path_or_url, .{})) |file| { + var resource: Resource = .{ .file = file }; + return runResource(f, path_or_url, &resource, null); + } else |err| break :e err; + } else dir_err; + + const uri = std.Uri.parse(path_or_url) catch |uri_err| { + return f.fail(0, try eb.printString( + "'{s}' could not be recognized as a file path ({s}) or an URL ({s})", + .{ path_or_url, @errorName(file_err), @errorName(uri_err) }, + )); + }; + var resource = try f.initResource(uri); + return runResource(f, uri.path, &resource, null); + } + }, + }; + + const s = fs.path.sep_str; + if (remote.hash) |expected_hash| { + const pkg_sub_path = "p" ++ s ++ expected_hash; + if (cache_root.handle.access(pkg_sub_path, .{})) |_| { + f.package_root = .{ + .root_dir = cache_root, + .sub_path = try arena.dupe(u8, pkg_sub_path), + }; + try loadManifest(f, f.package_root); + try 
checkBuildFileExistence(f); + if (!f.job_queue.recursive) return; + return queueJobsForDeps(f); + } else |err| switch (err) { + error.FileNotFound => {}, + else => |e| { + try eb.addRootErrorMessage(.{ + .msg = try eb.printString("unable to open global package cache directory '{}{s}': {s}", .{ + cache_root, pkg_sub_path, @errorName(e), + }), + }); + return error.FetchFailed; + }, + } + } + + // Fetch and unpack the remote into a temporary directory. + + const uri = std.Uri.parse(remote.url) catch |err| return f.fail( + f.location_tok, + try eb.printString("invalid URI: {s}", .{@errorName(err)}), + ); + var resource = try f.initResource(uri); + return runResource(f, uri.path, &resource, remote.hash); +} + +pub fn deinit(f: *Fetch) void { + f.error_bundle.deinit(); + f.arena.deinit(); +} + +/// Consumes `resource`, even if an error is returned. +fn runResource( + f: *Fetch, + uri_path: []const u8, + resource: *Resource, + remote_hash: ?Manifest.MultiHashHexDigest, +) RunError!void { + defer resource.deinit(); + const arena = f.arena.allocator(); + const eb = &f.error_bundle; + const s = fs.path.sep_str; + const cache_root = f.job_queue.global_cache; + const rand_int = std.crypto.random.int(u64); + const tmp_dir_sub_path = "tmp" ++ s ++ Manifest.hex64(rand_int); + + const tmp_directory_path = try cache_root.join(arena, &.{tmp_dir_sub_path}); + var tmp_directory: Cache.Directory = .{ + .path = tmp_directory_path, + .handle = handle: { + const dir = cache_root.handle.makeOpenPathIterable(tmp_dir_sub_path, .{}) catch |err| { + try eb.addRootErrorMessage(.{ + .msg = try eb.printString("unable to create temporary directory '{s}': {s}", .{ + tmp_directory_path, @errorName(err), + }), + }); + return error.FetchFailed; + }; + break :handle dir.dir; + }, + }; + defer tmp_directory.handle.close(); + + try unpackResource(f, resource, uri_path, tmp_directory); + + // Load, parse, and validate the unpacked build.zig.zon file. 
It is allowed + // for the file to be missing, in which case this fetched package is + // considered to be a "naked" package. + try loadManifest(f, .{ .root_dir = tmp_directory }); + + // Apply the manifest's inclusion rules to the temporary directory by + // deleting excluded files. If any error occurred for files that were + // ultimately excluded, those errors should be ignored, such as failure to + // create symlinks that weren't supposed to be included anyway. + + // Empty directories have already been omitted by `unpackResource`. + + const filter: Filter = .{ + .include_paths = if (f.manifest) |m| m.paths else .{}, + }; + + // Compute the package hash based on the remaining files in the temporary + // directory. + + if (builtin.os.tag == .linux and f.job_queue.work_around_btrfs_bug) { + // https://github.com/ziglang/zig/issues/17095 + tmp_directory.handle.close(); + const iterable_dir = cache_root.handle.makeOpenPathIterable(tmp_dir_sub_path, .{}) catch + @panic("btrfs workaround failed"); + tmp_directory.handle = iterable_dir.dir; + } + + f.actual_hash = try computeHash(f, tmp_directory, filter); + + // Rename the temporary directory into the global zig package cache + // directory. If the hash already exists, delete the temporary directory + // and leave the zig package cache directory untouched as it may be in use + // by the system. This is done even if the hash is invalid, in case the + // package with the different hash is used in the future. 
+ + f.package_root = .{ + .root_dir = cache_root, + .sub_path = try arena.dupe(u8, "p" ++ s ++ Manifest.hexDigest(f.actual_hash)), + }; + renameTmpIntoCache(cache_root.handle, tmp_dir_sub_path, f.package_root.sub_path) catch |err| { + const src = try cache_root.join(arena, &.{tmp_dir_sub_path}); + const dest = try cache_root.join(arena, &.{f.package_root.sub_path}); + try eb.addRootErrorMessage(.{ .msg = try eb.printString( + "unable to rename temporary directory '{s}' into package cache directory '{s}': {s}", + .{ src, dest, @errorName(err) }, + ) }); + return error.FetchFailed; + }; + + // Validate the computed hash against the expected hash. If invalid, this + // job is done. + + const actual_hex = Manifest.hexDigest(f.actual_hash); + if (remote_hash) |declared_hash| { + if (!std.mem.eql(u8, &declared_hash, &actual_hex)) { + return f.fail(f.hash_tok, try eb.printString( + "hash mismatch: manifest declares {s} but the fetched package has {s}", + .{ declared_hash, actual_hex }, + )); + } + } else if (!f.omit_missing_hash_error) { + const notes_len = 1; + try eb.addRootErrorMessage(.{ + .msg = try eb.addString("dependency is missing hash field"), + .src_loc = try f.srcLoc(f.location_tok), + .notes_len = notes_len, + }); + const notes_start = try eb.reserveNotes(notes_len); + eb.extra.items[notes_start] = @intFromEnum(try eb.addErrorMessage(.{ + .msg = try eb.printString("expected .hash = \"{s}\",", .{&actual_hex}), + })); + return error.FetchFailed; + } + + // Spawn a new fetch job for each dependency in the manifest file. Use + // a mutex and a hash map so that redundant jobs do not get queued up. + if (!f.job_queue.recursive) return; + return queueJobsForDeps(f); +} + +/// `computeHash` gets a free check for the existence of `build.zig`, but when +/// not computing a hash, we need to do a syscall to check for it. 
+fn checkBuildFileExistence(f: *Fetch) RunError!void { + const eb = &f.error_bundle; + if (f.package_root.access(Package.build_zig_basename, .{})) |_| { + f.has_build_zig = true; + } else |err| switch (err) { + error.FileNotFound => {}, + else => |e| { + try eb.addRootErrorMessage(.{ + .msg = try eb.printString("unable to access '{}{s}': {s}", .{ + f.package_root, Package.build_zig_basename, @errorName(e), + }), + }); + return error.FetchFailed; + }, + } +} + +/// This function populates `f.manifest` or leaves it `null`. +fn loadManifest(f: *Fetch, pkg_root: Package.Path) RunError!void { + const eb = &f.error_bundle; + const arena = f.arena.allocator(); + const manifest_bytes = pkg_root.root_dir.handle.readFileAllocOptions( + arena, + try fs.path.join(arena, &.{ pkg_root.sub_path, Manifest.basename }), + Manifest.max_bytes, + null, + 1, + 0, + ) catch |err| switch (err) { + error.FileNotFound => return, + else => |e| { + const file_path = try pkg_root.join(arena, Manifest.basename); + try eb.addRootErrorMessage(.{ + .msg = try eb.printString("unable to load package manifest '{}': {s}", .{ + file_path, @errorName(e), + }), + }); + return error.FetchFailed; + }, + }; + + const ast = &f.manifest_ast; + ast.* = try std.zig.Ast.parse(arena, manifest_bytes, .zon); + + if (ast.errors.len > 0) { + const file_path = try std.fmt.allocPrint(arena, "{}" ++ Manifest.basename, .{pkg_root}); + try main.putAstErrorsIntoBundle(arena, ast.*, file_path, eb); + return error.FetchFailed; + } + + f.manifest = try Manifest.parse(arena, ast.*, .{ + .allow_missing_paths_field = f.allow_missing_paths_field, + }); + const manifest = &f.manifest.?; + + if (manifest.errors.len > 0) { + const src_path = try eb.printString("{}{s}", .{ pkg_root, Manifest.basename }); + const token_starts = ast.tokens.items(.start); + + for (manifest.errors) |msg| { + const start_loc = ast.tokenLocation(0, msg.tok); + + try eb.addRootErrorMessage(.{ + .msg = try eb.addString(msg.msg), + .src_loc = try 
eb.addSourceLocation(.{ + .src_path = src_path, + .span_start = token_starts[msg.tok], + .span_end = @intCast(token_starts[msg.tok] + ast.tokenSlice(msg.tok).len), + .span_main = token_starts[msg.tok] + msg.off, + .line = @intCast(start_loc.line), + .column = @intCast(start_loc.column), + .source_line = try eb.addString(ast.source[start_loc.line_start..start_loc.line_end]), + }), + }); + } + return error.FetchFailed; + } +} + +fn queueJobsForDeps(f: *Fetch) RunError!void { + assert(f.job_queue.recursive); + + // If the package does not have a build.zig.zon file then there are no dependencies. + const manifest = f.manifest orelse return; + + const new_fetches, const prog_names = nf: { + const parent_arena = f.arena.allocator(); + const gpa = f.arena.child_allocator; + const cache_root = f.job_queue.global_cache; + const dep_names = manifest.dependencies.keys(); + const deps = manifest.dependencies.values(); + // Grab the new tasks into a temporary buffer so we can unlock that mutex + // as fast as possible. + // This overallocates any fetches that get skipped by the `continue` in the + // loop below. + const new_fetches = try parent_arena.alloc(Fetch, deps.len); + const prog_names = try parent_arena.alloc([]const u8, deps.len); + var new_fetch_index: usize = 0; + + f.job_queue.mutex.lock(); + defer f.job_queue.mutex.unlock(); + + try f.job_queue.all_fetches.ensureUnusedCapacity(gpa, new_fetches.len); + try f.job_queue.table.ensureUnusedCapacity(gpa, @intCast(new_fetches.len)); + + // There are four cases here: + // * Correct hash is provided by manifest. + // - Hash map already has the entry, no need to add it again. + // * Incorrect hash is provided by manifest. + // - Hash mismatch error emitted; `queueJobsForDeps` is not called. + // * Hash is not provided by manifest. + // - Hash missing error emitted; `queueJobsForDeps` is not called. + // * path-based location is used without a hash. 
+ // - Hash is added to the table based on the path alone before + // calling run(); no need to add it again. + + for (dep_names, deps) |dep_name, dep| { + const new_fetch = &new_fetches[new_fetch_index]; + const location: Location = switch (dep.location) { + .url => |url| .{ .remote = .{ + .url = url, + .hash = h: { + const h = dep.hash orelse break :h null; + const digest_len = @typeInfo(Manifest.MultiHashHexDigest).Array.len; + const multihash_digest = h[0..digest_len].*; + const gop = f.job_queue.table.getOrPutAssumeCapacity(multihash_digest); + if (gop.found_existing) continue; + gop.value_ptr.* = new_fetch; + break :h multihash_digest; + }, + } }, + .path => |rel_path| l: { + // This might produce an invalid path, which is checked for + // at the beginning of run(). + const new_root = try f.package_root.resolvePosix(parent_arena, rel_path); + const multihash_digest = relativePathDigest(new_root, cache_root); + const gop = f.job_queue.table.getOrPutAssumeCapacity(multihash_digest); + if (gop.found_existing) continue; + gop.value_ptr.* = new_fetch; + break :l .{ .relative_path = new_root }; + }, + }; + prog_names[new_fetch_index] = dep_name; + new_fetch_index += 1; + f.job_queue.all_fetches.appendAssumeCapacity(new_fetch); + new_fetch.* = .{ + .arena = std.heap.ArenaAllocator.init(gpa), + .location = location, + .location_tok = dep.location_tok, + .hash_tok = dep.hash_tok, + .parent_package_root = f.package_root, + .parent_manifest_ast = &f.manifest_ast, + .prog_node = f.prog_node, + .job_queue = f.job_queue, + .omit_missing_hash_error = false, + .allow_missing_paths_field = true, + + .package_root = undefined, + .error_bundle = undefined, + .manifest = null, + .manifest_ast = undefined, + .actual_hash = undefined, + .has_build_zig = false, + .oom_flag = false, + + .module = null, + }; + } + + // job_queue mutex is locked so this is OK. 
+ f.prog_node.unprotected_estimated_total_items += new_fetch_index; + + break :nf .{ new_fetches[0..new_fetch_index], prog_names[0..new_fetch_index] }; + }; + + // Now it's time to give tasks to the thread pool. + const thread_pool = f.job_queue.thread_pool; + + for (new_fetches, prog_names) |*new_fetch, prog_name| { + f.job_queue.wait_group.start(); + thread_pool.spawn(workerRun, .{ new_fetch, prog_name }) catch |err| switch (err) { + error.OutOfMemory => { + new_fetch.oom_flag = true; + f.job_queue.wait_group.finish(); + continue; + }, + }; + } +} + +/// Computes the `JobQueue.table` key for a path-based dependency, which has no +/// content hash: a digest of a prefix telling whether `pkg_root` is rooted in +/// `cache_root` or in the project, followed by `pkg_root.sub_path`. +pub fn relativePathDigest( + pkg_root: Package.Path, + cache_root: Cache.Directory, +) Manifest.MultiHashHexDigest { + var hasher = Manifest.Hash.init(.{}); + // This hash is a tuple of: + // * whether it is relative to the global cache directory or to the root package + // * the relative file path from there to the build root of the package + hasher.update(if (pkg_root.root_dir.eql(cache_root)) + &package_hash_prefix_cached + else + &package_hash_prefix_project); + hasher.update(pkg_root.sub_path); + return Manifest.hexDigest(hasher.finalResult()); +} + +/// Thread pool entry point for one `Fetch` task: reports progress under +/// `prog_name`, calls `run`, and records an out-of-memory failure in +/// `oom_flag`. Always signals the job queue's wait group when done. +pub fn workerRun(f: *Fetch, prog_name: []const u8) void { + defer f.job_queue.wait_group.finish(); + + var prog_node = f.prog_node.start(prog_name, 0); + defer prog_node.end(); + prog_node.activate(); + + run(f) catch |err| switch (err) { + error.OutOfMemory => f.oom_flag = true, + error.FetchFailed => { + // Nothing to do because the errors are already reported in `error_bundle`, + // and a reference is kept to the `Fetch` task inside `all_fetches`. 
+ }, + }; +} + +fn srcLoc( + f: *Fetch, + tok: std.zig.Ast.TokenIndex, +) Allocator.Error!ErrorBundle.SourceLocationIndex { + const ast = f.parent_manifest_ast orelse return .none; + const eb = &f.error_bundle; + const token_starts = ast.tokens.items(.start); + const start_loc = ast.tokenLocation(0, tok); + const src_path = try eb.printString("{}" ++ Manifest.basename, .{f.parent_package_root}); + const msg_off = 0; + return eb.addSourceLocation(.{ + .src_path = src_path, + .span_start = token_starts[tok], + .span_end = @intCast(token_starts[tok] + ast.tokenSlice(tok).len), + .span_main = token_starts[tok] + msg_off, + .line = @intCast(start_loc.line), + .column = @intCast(start_loc.column), + .source_line = try eb.addString(ast.source[start_loc.line_start..start_loc.line_end]), + }); +} + +fn fail(f: *Fetch, msg_tok: std.zig.Ast.TokenIndex, msg_str: u32) RunError { + const eb = &f.error_bundle; + try eb.addRootErrorMessage(.{ + .msg = msg_str, + .src_loc = try f.srcLoc(msg_tok), + }); + return error.FetchFailed; +} + +const Resource = union(enum) { + file: fs.File, + http_request: std.http.Client.Request, + git: Git, + dir: fs.IterableDir, + + const Git = struct { + fetch_stream: git.Session.FetchStream, + want_oid: [git.oid_length]u8, + }; + + fn deinit(resource: *Resource) void { + switch (resource.*) { + .file => |*file| file.close(), + .http_request => |*req| req.deinit(), + .git => |*git_resource| git_resource.fetch_stream.deinit(), + .dir => |*dir| dir.close(), + } + resource.* = undefined; + } + + fn reader(resource: *Resource) std.io.AnyReader { + return .{ + .context = resource, + .readFn = read, + }; + } + + fn read(context: *const anyopaque, buffer: []u8) anyerror!usize { + const resource: *Resource = @constCast(@ptrCast(@alignCast(context))); + switch (resource.*) { + .file => |*f| return f.read(buffer), + .http_request => |*r| return r.read(buffer), + .git => |*g| return g.fetch_stream.read(buffer), + .dir => unreachable, + } + } +}; + +const FileType 
= enum { + tar, + @"tar.gz", + @"tar.xz", + git_pack, + + /// Maps a file name or path to an archive type by its extension, compared + /// case-insensitively. Returns null when the extension is not recognized; + /// `.git_pack` is never returned from here. + fn fromPath(file_path: []const u8) ?FileType { + if (ascii.endsWithIgnoreCase(file_path, ".tar")) return .tar; + if (ascii.endsWithIgnoreCase(file_path, ".tar.gz")) return .@"tar.gz"; + if (ascii.endsWithIgnoreCase(file_path, ".tar.xz")) return .@"tar.xz"; + return null; + } + + /// Parameter is a content-disposition header value. + /// Returns null unless the header contains `attachment;` followed by a + /// `filename=` or `filename*=` parameter whose value has an extension + /// recognized by `fromPath`. + fn fromContentDisposition(cd_header: []const u8) ?FileType { + const attach_end = ascii.indexOfIgnoreCase(cd_header, "attachment;") orelse + return null; + + var value_start = ascii.indexOfIgnoreCasePos(cd_header, attach_end + 1, "filename") orelse + return null; + value_start += "filename".len; + // The header is server-controlled and may end right after "filename" + // or "filename*", so bounds-check before indexing. + if (value_start < cd_header.len and cd_header[value_start] == '*') { + value_start += 1; + } + if (value_start >= cd_header.len or cd_header[value_start] != '=') return null; + value_start += 1; + + var value_end = std.mem.indexOfPos(u8, cd_header, value_start, ";") orelse cd_header.len; + // Drop a closing quote. An opening quote is left in place, which is + // harmless because `fromPath` only inspects the suffix. + if (cd_header[value_end - 1] == '\"') { + value_end -= 1; + } + return fromPath(cd_header[value_start..value_end]); + } + + test fromContentDisposition { + try std.testing.expectEqual(@as(?FileType, .@"tar.gz"), fromContentDisposition("attaChment; FILENAME=\"stuff.tar.gz\"; size=42")); + try std.testing.expectEqual(@as(?FileType, .@"tar.gz"), fromContentDisposition("attachment; filename*=\"stuff.tar.gz\"")); + try std.testing.expectEqual(@as(?FileType, .@"tar.xz"), fromContentDisposition("ATTACHMENT; filename=\"stuff.tar.xz\"")); + try std.testing.expectEqual(@as(?FileType, .@"tar.xz"), fromContentDisposition("attachment; FileName=\"stuff.tar.xz\"")); + try std.testing.expectEqual(@as(?FileType, .@"tar.gz"), fromContentDisposition("attachment; FileName*=UTF-8\'\'xyz%2Fstuff.tar.gz")); + try std.testing.expectEqual(@as(?FileType, .tar), fromContentDisposition("attachment; FileName=\"stuff.tar\"")); + + try std.testing.expect(fromContentDisposition("attachment FileName=\"stuff.tar.gz\"") == null); + try std.testing.expect(fromContentDisposition("attachment; filename") == null); + try std.testing.expect(fromContentDisposition("attachment; filename*") == null); + try std.testing.expect(fromContentDisposition("attachment; filename=") == null); + try std.testing.expect(fromContentDisposition("attachment; FileName\"stuff.gz\"") == 
null); + try std.testing.expect(fromContentDisposition("attachment; size=42") == null); + try std.testing.expect(fromContentDisposition("inline; size=42") == null); + try std.testing.expect(fromContentDisposition("FileName=\"stuff.tar.gz\"; attachment;") == null); + try std.testing.expect(fromContentDisposition("FileName=\"stuff.tar.gz\";") == null); + } +}; + +fn initResource(f: *Fetch, uri: std.Uri) RunError!Resource { + const gpa = f.arena.child_allocator; + const arena = f.arena.allocator(); + const eb = &f.error_bundle; + + if (ascii.eqlIgnoreCase(uri.scheme, "file")) return .{ + .file = f.parent_package_root.openFile(uri.path, .{}) catch |err| { + return f.fail(f.location_tok, try eb.printString("unable to open '{}{s}': {s}", .{ + f.parent_package_root, uri.path, @errorName(err), + })); + }, + }; + + const http_client = f.job_queue.http_client; + + if (ascii.eqlIgnoreCase(uri.scheme, "http") or + ascii.eqlIgnoreCase(uri.scheme, "https")) + { + var h = std.http.Headers{ .allocator = gpa }; + defer h.deinit(); + + var req = http_client.request(.GET, uri, h, .{}) catch |err| { + return f.fail(f.location_tok, try eb.printString( + "unable to connect to server: {s}", + .{@errorName(err)}, + )); + }; + errdefer req.deinit(); // releases more than memory + + req.start(.{}) catch |err| { + return f.fail(f.location_tok, try eb.printString( + "HTTP request failed: {s}", + .{@errorName(err)}, + )); + }; + req.wait() catch |err| { + return f.fail(f.location_tok, try eb.printString( + "invalid HTTP response: {s}", + .{@errorName(err)}, + )); + }; + + if (req.response.status != .ok) { + return f.fail(f.location_tok, try eb.printString( + "bad HTTP response code: '{d} {s}'", + .{ @intFromEnum(req.response.status), req.response.status.phrase() orelse "" }, + )); + } + + return .{ .http_request = req }; + } + + if (ascii.eqlIgnoreCase(uri.scheme, "git+http") or + ascii.eqlIgnoreCase(uri.scheme, "git+https")) + { + var transport_uri = uri; + transport_uri.scheme = 
uri.scheme["git+".len..]; + var redirect_uri: []u8 = undefined; + var session: git.Session = .{ .transport = http_client, .uri = transport_uri }; + session.discoverCapabilities(gpa, &redirect_uri) catch |err| switch (err) { + error.Redirected => { + defer gpa.free(redirect_uri); + return f.fail(f.location_tok, try eb.printString( + "repository moved to {s}", + .{redirect_uri}, + )); + }, + else => |e| { + return f.fail(f.location_tok, try eb.printString( + "unable to discover remote git server capabilities: {s}", + .{@errorName(e)}, + )); + }, + }; + + const want_oid = want_oid: { + const want_ref = uri.fragment orelse "HEAD"; + if (git.parseOid(want_ref)) |oid| break :want_oid oid else |_| {} + + const want_ref_head = try std.fmt.allocPrint(arena, "refs/heads/{s}", .{want_ref}); + const want_ref_tag = try std.fmt.allocPrint(arena, "refs/tags/{s}", .{want_ref}); + + var ref_iterator = session.listRefs(gpa, .{ + .ref_prefixes = &.{ want_ref, want_ref_head, want_ref_tag }, + .include_peeled = true, + }) catch |err| { + return f.fail(f.location_tok, try eb.printString( + "unable to list refs: {s}", + .{@errorName(err)}, + )); + }; + defer ref_iterator.deinit(); + while (ref_iterator.next() catch |err| { + return f.fail(f.location_tok, try eb.printString( + "unable to iterate refs: {s}", + .{@errorName(err)}, + )); + }) |ref| { + if (std.mem.eql(u8, ref.name, want_ref) or + std.mem.eql(u8, ref.name, want_ref_head) or + std.mem.eql(u8, ref.name, want_ref_tag)) + { + break :want_oid ref.peeled orelse ref.oid; + } + } + return f.fail(f.location_tok, try eb.printString("ref not found: {s}", .{want_ref})); + }; + if (uri.fragment == null) { + const notes_len = 1; + try eb.addRootErrorMessage(.{ + .msg = try eb.addString("url field is missing an explicit ref"), + .src_loc = try f.srcLoc(f.location_tok), + .notes_len = notes_len, + }); + const notes_start = try eb.reserveNotes(notes_len); + eb.extra.items[notes_start] = @intFromEnum(try eb.addErrorMessage(.{ + .msg = try 
eb.printString("try .url = \"{+/}#{}\",", .{ + uri, std.fmt.fmtSliceHexLower(&want_oid), + }), + })); + return error.FetchFailed; + } + + var want_oid_buf: [git.fmt_oid_length]u8 = undefined; + _ = std.fmt.bufPrint(&want_oid_buf, "{}", .{ + std.fmt.fmtSliceHexLower(&want_oid), + }) catch unreachable; + var fetch_stream = session.fetch(gpa, &.{&want_oid_buf}) catch |err| { + return f.fail(f.location_tok, try eb.printString( + "unable to create fetch stream: {s}", + .{@errorName(err)}, + )); + }; + errdefer fetch_stream.deinit(); + + return .{ .git = .{ + .fetch_stream = fetch_stream, + .want_oid = want_oid, + } }; + } + + return f.fail(f.location_tok, try eb.printString( + "unsupported URL scheme: {s}", + .{uri.scheme}, + )); +} + +fn unpackResource( + f: *Fetch, + resource: *Resource, + uri_path: []const u8, + tmp_directory: Cache.Directory, +) RunError!void { + const eb = &f.error_bundle; + const file_type = switch (resource.*) { + .file => FileType.fromPath(uri_path) orelse + return f.fail(f.location_tok, try eb.printString("unknown file type: '{s}'", .{uri_path})), + + .http_request => |req| ft: { + // Content-Type takes first precedence. + const content_type = req.response.headers.getFirstValue("Content-Type") orelse + return f.fail(f.location_tok, try eb.addString("missing 'Content-Type' header")); + + if (ascii.eqlIgnoreCase(content_type, "application/x-tar")) + break :ft .tar; + + if (ascii.eqlIgnoreCase(content_type, "application/gzip") or + ascii.eqlIgnoreCase(content_type, "application/x-gzip") or + ascii.eqlIgnoreCase(content_type, "application/tar+gzip")) + { + break :ft .@"tar.gz"; + } + + if (ascii.eqlIgnoreCase(content_type, "application/x-xz")) + break :ft .@"tar.xz"; + + if (!ascii.eqlIgnoreCase(content_type, "application/octet-stream")) { + return f.fail(f.location_tok, try eb.printString( + "unrecognized 'Content-Type' header: '{s}'", + .{content_type}, + )); + } + + // Next, the filename from 'content-disposition: attachment' takes precedence. 
+ if (req.response.headers.getFirstValue("Content-Disposition")) |cd_header| { + break :ft FileType.fromContentDisposition(cd_header) orelse { + return f.fail(f.location_tok, try eb.printString( + "unsupported Content-Disposition header value: '{s}' for Content-Type=application/octet-stream", + .{cd_header}, + )); + }; + } + + // Finally, the path from the URI is used. + break :ft FileType.fromPath(uri_path) orelse { + return f.fail(f.location_tok, try eb.printString( + "unknown file type: '{s}'", + .{uri_path}, + )); + }; + }, + + .git => .git_pack, + + .dir => |dir| return f.recursiveDirectoryCopy(dir, tmp_directory.handle) catch |err| { + return f.fail(f.location_tok, try eb.printString( + "unable to copy directory '{s}': {s}", + .{ uri_path, @errorName(err) }, + )); + }, + }; + + switch (file_type) { + .tar => try unpackTarball(f, tmp_directory.handle, resource.reader()), + .@"tar.gz" => try unpackTarballCompressed(f, tmp_directory.handle, resource, std.compress.gzip), + .@"tar.xz" => try unpackTarballCompressed(f, tmp_directory.handle, resource, std.compress.xz), + .git_pack => unpackGitPack(f, tmp_directory.handle, resource) catch |err| switch (err) { + error.FetchFailed => return error.FetchFailed, + error.OutOfMemory => return error.OutOfMemory, + else => |e| return f.fail(f.location_tok, try eb.printString( + "unable to unpack git files: {s}", + .{@errorName(e)}, + )), + }, + } +} + +fn unpackTarballCompressed( + f: *Fetch, + out_dir: fs.Dir, + resource: *Resource, + comptime Compression: type, +) RunError!void { + const gpa = f.arena.child_allocator; + const eb = &f.error_bundle; + const reader = resource.reader(); + var br = std.io.bufferedReaderSize(std.crypto.tls.max_ciphertext_record_len, reader); + + var decompress = Compression.decompress(gpa, br.reader()) catch |err| { + return f.fail(f.location_tok, try eb.printString( + "unable to decompress tarball: {s}", + .{@errorName(err)}, + )); + }; + defer decompress.deinit(); + + return unpackTarball(f, 
out_dir, decompress.reader()); +} + +fn unpackTarball(f: *Fetch, out_dir: fs.Dir, reader: anytype) RunError!void { + const eb = &f.error_bundle; + const gpa = f.arena.child_allocator; + + var diagnostics: std.tar.Options.Diagnostics = .{ .allocator = gpa }; + defer diagnostics.deinit(); + + std.tar.pipeToFileSystem(out_dir, reader, .{ + .diagnostics = &diagnostics, + .strip_components = 1, + // TODO: we would like to set this to executable_bit_only, but two + // things need to happen before that: + // 1. the tar implementation needs to support it + // 2. the hashing algorithm here needs to support detecting the is_executable + // bit on Windows from the ACLs (see the isExecutable function). + .mode_mode = .ignore, + .exclude_empty_directories = true, + }) catch |err| return f.fail(f.location_tok, try eb.printString( + "unable to unpack tarball to temporary directory: {s}", + .{@errorName(err)}, + )); + + if (diagnostics.errors.items.len > 0) { + const notes_len: u32 = @intCast(diagnostics.errors.items.len); + try eb.addRootErrorMessage(.{ + .msg = try eb.addString("unable to unpack tarball"), + .src_loc = try f.srcLoc(f.location_tok), + .notes_len = notes_len, + }); + const notes_start = try eb.reserveNotes(notes_len); + for (diagnostics.errors.items, notes_start..) 
|item, note_i| { + switch (item) { + .unable_to_create_sym_link => |info| { + eb.extra.items[note_i] = @intFromEnum(try eb.addErrorMessage(.{ + .msg = try eb.printString("unable to create symlink from '{s}' to '{s}': {s}", .{ + info.file_name, info.link_name, @errorName(info.code), + }), + })); + }, + .unable_to_create_file => |info| { + eb.extra.items[note_i] = @intFromEnum(try eb.addErrorMessage(.{ + .msg = try eb.printString("unable to create file '{s}': {s}", .{ + info.file_name, @errorName(info.code), + }), + })); + }, + .unsupported_file_type => |info| { + eb.extra.items[note_i] = @intFromEnum(try eb.addErrorMessage(.{ + .msg = try eb.printString("file '{s}' has unsupported type '{c}'", .{ + info.file_name, @intFromEnum(info.file_type), + }), + })); + }, + } + } + return error.FetchFailed; + } +} + +fn unpackGitPack(f: *Fetch, out_dir: fs.Dir, resource: *Resource) anyerror!void { + const eb = &f.error_bundle; + const gpa = f.arena.child_allocator; + const want_oid = resource.git.want_oid; + const reader = resource.git.fetch_stream.reader(); + // The .git directory is used to store the packfile and associated index, but + // we do not attempt to replicate the exact structure of a real .git + // directory, since that isn't relevant for fetching a package. 
+ { + var pack_dir = try out_dir.makeOpenPath(".git", .{}); + defer pack_dir.close(); + var pack_file = try pack_dir.createFile("pkg.pack", .{ .read = true }); + defer pack_file.close(); + var fifo = std.fifo.LinearFifo(u8, .{ .Static = 4096 }).init(); + try fifo.pump(reader, pack_file.writer()); + try pack_file.sync(); + + var index_file = try pack_dir.createFile("pkg.idx", .{ .read = true }); + defer index_file.close(); + { + var index_prog_node = f.prog_node.start("Index pack", 0); + defer index_prog_node.end(); + index_prog_node.activate(); + var index_buffered_writer = std.io.bufferedWriter(index_file.writer()); + try git.indexPack(gpa, pack_file, index_buffered_writer.writer()); + try index_buffered_writer.flush(); + try index_file.sync(); + } + + { + var checkout_prog_node = f.prog_node.start("Checkout", 0); + defer checkout_prog_node.end(); + checkout_prog_node.activate(); + var repository = try git.Repository.init(gpa, pack_file, index_file); + defer repository.deinit(); + var diagnostics: git.Diagnostics = .{ .allocator = gpa }; + defer diagnostics.deinit(); + try repository.checkout(out_dir, want_oid, &diagnostics); + + if (diagnostics.errors.items.len > 0) { + const notes_len: u32 = @intCast(diagnostics.errors.items.len); + try eb.addRootErrorMessage(.{ + .msg = try eb.addString("unable to unpack packfile"), + .src_loc = try f.srcLoc(f.location_tok), + .notes_len = notes_len, + }); + const notes_start = try eb.reserveNotes(notes_len); + for (diagnostics.errors.items, notes_start..) 
|item, note_i| { + switch (item) { + .unable_to_create_sym_link => |info| { + eb.extra.items[note_i] = @intFromEnum(try eb.addErrorMessage(.{ + .msg = try eb.printString("unable to create symlink from '{s}' to '{s}': {s}", .{ + info.file_name, info.link_name, @errorName(info.code), + }), + })); + }, + } + } + return error.InvalidGitPack; + } + } + } + + try out_dir.deleteTree(".git"); +} + +fn recursiveDirectoryCopy(f: *Fetch, dir: fs.IterableDir, tmp_dir: fs.Dir) anyerror!void { + const gpa = f.arena.child_allocator; + // Recursive directory copy. + var it = try dir.walk(gpa); + defer it.deinit(); + while (try it.next()) |entry| { + switch (entry.kind) { + .directory => {}, // omit empty directories + .file => { + dir.dir.copyFile( + entry.path, + tmp_dir, + entry.path, + .{}, + ) catch |err| switch (err) { + error.FileNotFound => { + if (fs.path.dirname(entry.path)) |dirname| try tmp_dir.makePath(dirname); + try dir.dir.copyFile(entry.path, tmp_dir, entry.path, .{}); + }, + else => |e| return e, + }; + }, + .sym_link => { + var buf: [fs.MAX_PATH_BYTES]u8 = undefined; + const link_name = try dir.dir.readLink(entry.path, &buf); + // TODO: if this would create a symlink to outside + // the destination directory, fail with an error instead. 
+ tmp_dir.symLink(link_name, entry.path, .{}) catch |err| switch (err) { + error.FileNotFound => { + if (fs.path.dirname(entry.path)) |dirname| try tmp_dir.makePath(dirname); + try tmp_dir.symLink(link_name, entry.path, .{}); + }, + else => |e| return e, + }; + }, + else => return error.IllegalFileTypeInPackage, + } + } +} + +pub fn renameTmpIntoCache( + cache_dir: fs.Dir, + tmp_dir_sub_path: []const u8, + dest_dir_sub_path: []const u8, +) !void { + assert(dest_dir_sub_path[1] == fs.path.sep); + var handled_missing_dir = false; + while (true) { + cache_dir.rename(tmp_dir_sub_path, dest_dir_sub_path) catch |err| switch (err) { + error.FileNotFound => { + if (handled_missing_dir) return err; + cache_dir.makeDir(dest_dir_sub_path[0..1]) catch |mkd_err| switch (mkd_err) { + error.PathAlreadyExists => handled_missing_dir = true, + else => |e| return e, + }; + continue; + }, + error.PathAlreadyExists, error.AccessDenied => { + // Package has been already downloaded and may already be in use on the system. + cache_dir.deleteTree(tmp_dir_sub_path) catch { + // Garbage files leftover in zig-cache/tmp/ is, as they say + // on Star Trek, "operating within normal parameters". + }; + }, + else => |e| return e, + }; + break; + } +} + +/// Assumes that files not included in the package have already been filtered +/// prior to calling this function. This ensures that files not protected by +/// the hash are not present on the file system. Empty directories are *not +/// hashed* and must not be present on the file system when calling this +/// function. +fn computeHash( + f: *Fetch, + tmp_directory: Cache.Directory, + filter: Filter, +) RunError!Manifest.Digest { + // All the path name strings need to be in memory for sorting. + const arena = f.arena.allocator(); + const gpa = f.arena.child_allocator; + const eb = &f.error_bundle; + const thread_pool = f.job_queue.thread_pool; + + // Collect all files, recursively, then sort. 
+ var all_files = std.ArrayList(*HashedFile).init(gpa); + defer all_files.deinit(); + + var deleted_files = std.ArrayList(*DeletedFile).init(gpa); + defer deleted_files.deinit(); + + // Track directories which had any files deleted from them so that empty directories + // can be deleted. + var sus_dirs: std.StringArrayHashMapUnmanaged(void) = .{}; + defer sus_dirs.deinit(gpa); + + var walker = try @as(fs.IterableDir, .{ .dir = tmp_directory.handle }).walk(gpa); + defer walker.deinit(); + + { + // The final hash will be a hash of each file hashed independently. This + // allows hashing in parallel. + var wait_group: WaitGroup = .{}; + // `computeHash` is called from a worker thread so there must not be + // any waiting without working or a deadlock could occur. + defer thread_pool.waitAndWork(&wait_group); + + while (walker.next() catch |err| { + try eb.addRootErrorMessage(.{ .msg = try eb.printString( + "unable to walk temporary directory '{}': {s}", + .{ tmp_directory, @errorName(err) }, + ) }); + return error.FetchFailed; + }) |entry| { + if (entry.kind == .directory) continue; + + if (!filter.includePath(entry.path)) { + // Delete instead of including in hash calculation. + const fs_path = try arena.dupe(u8, entry.path); + + // Also track the parent directory in case it becomes empty. 
+ if (fs.path.dirname(fs_path)) |parent| + try sus_dirs.put(gpa, parent, {}); + + const deleted_file = try arena.create(DeletedFile); + deleted_file.* = .{ + .fs_path = fs_path, + .failure = undefined, // to be populated by the worker + }; + wait_group.start(); + try thread_pool.spawn(workerDeleteFile, .{ + tmp_directory.handle, deleted_file, &wait_group, + }); + try deleted_files.append(deleted_file); + continue; + } + + const kind: HashedFile.Kind = switch (entry.kind) { + .directory => unreachable, + .file => .file, + .sym_link => .sym_link, + else => return f.fail(f.location_tok, try eb.printString( + "package contains '{s}' which has illegal file type '{s}'", + .{ entry.path, @tagName(entry.kind) }, + )), + }; + + if (std.mem.eql(u8, entry.path, Package.build_zig_basename)) + f.has_build_zig = true; + + const fs_path = try arena.dupe(u8, entry.path); + const hashed_file = try arena.create(HashedFile); + hashed_file.* = .{ + .fs_path = fs_path, + .normalized_path = try normalizePath(arena, fs_path), + .kind = kind, + .hash = undefined, // to be populated by the worker + .failure = undefined, // to be populated by the worker + }; + wait_group.start(); + try thread_pool.spawn(workerHashFile, .{ + tmp_directory.handle, hashed_file, &wait_group, + }); + try all_files.append(hashed_file); + } + } + + { + // Sort by length, descending, so that child directories get removed first. + sus_dirs.sortUnstable(@as(struct { + keys: []const []const u8, + pub fn lessThan(ctx: @This(), a_index: usize, b_index: usize) bool { + return ctx.keys[b_index].len < ctx.keys[a_index].len; + } + }, .{ .keys = sus_dirs.keys() })); + + // During this loop, more entries will be added, so we must loop by index. 
+ var i: usize = 0; + while (i < sus_dirs.count()) : (i += 1) { + const sus_dir = sus_dirs.keys()[i]; + tmp_directory.handle.deleteDir(sus_dir) catch |err| switch (err) { + error.DirNotEmpty => continue, + error.FileNotFound => continue, + else => |e| { + try eb.addRootErrorMessage(.{ .msg = try eb.printString( + "unable to delete empty directory '{s}': {s}", + .{ sus_dir, @errorName(e) }, + ) }); + return error.FetchFailed; + }, + }; + if (fs.path.dirname(sus_dir)) |parent| { + try sus_dirs.put(gpa, parent, {}); + } + } + } + + std.mem.sortUnstable(*HashedFile, all_files.items, {}, HashedFile.lessThan); + + var hasher = Manifest.Hash.init(.{}); + var any_failures = false; + for (all_files.items) |hashed_file| { + hashed_file.failure catch |err| { + any_failures = true; + try eb.addRootErrorMessage(.{ + .msg = try eb.printString("unable to hash '{s}': {s}", .{ + hashed_file.fs_path, @errorName(err), + }), + }); + }; + hasher.update(&hashed_file.hash); + } + for (deleted_files.items) |deleted_file| { + deleted_file.failure catch |err| { + any_failures = true; + try eb.addRootErrorMessage(.{ + .msg = try eb.printString("failed to delete excluded path '{s}' from package: {s}", .{ + deleted_file.fs_path, @errorName(err), + }), + }); + }; + } + + if (any_failures) return error.FetchFailed; + return hasher.finalResult(); +} + +fn workerHashFile(dir: fs.Dir, hashed_file: *HashedFile, wg: *WaitGroup) void { + defer wg.finish(); + hashed_file.failure = hashFileFallible(dir, hashed_file); +} + +fn workerDeleteFile(dir: fs.Dir, deleted_file: *DeletedFile, wg: *WaitGroup) void { + defer wg.finish(); + deleted_file.failure = deleteFileFallible(dir, deleted_file); +} + +fn hashFileFallible(dir: fs.Dir, hashed_file: *HashedFile) HashedFile.Error!void { + var buf: [8000]u8 = undefined; + var hasher = Manifest.Hash.init(.{}); + hasher.update(hashed_file.normalized_path); + switch (hashed_file.kind) { + .file => { + var file = try dir.openFile(hashed_file.fs_path, .{}); + defer 
file.close(); + hasher.update(&.{ 0, @intFromBool(try isExecutable(file)) }); + while (true) { + const bytes_read = try file.read(&buf); + if (bytes_read == 0) break; + hasher.update(buf[0..bytes_read]); + } + }, + .sym_link => { + const link_name = try dir.readLink(hashed_file.fs_path, &buf); + hasher.update(link_name); + }, + } + hasher.final(&hashed_file.hash); +} + +fn deleteFileFallible(dir: fs.Dir, deleted_file: *DeletedFile) DeletedFile.Error!void { + try dir.deleteFile(deleted_file.fs_path); +} + +fn isExecutable(file: fs.File) !bool { + if (builtin.os.tag == .windows) { + // TODO check the ACL on Windows. + // Until this is implemented, this could be a false negative on + // Windows, which is why we do not yet set executable_bit_only above + // when unpacking the tarball. + return false; + } else { + const stat = try file.stat(); + return (stat.mode & std.os.S.IXUSR) != 0; + } +} + +const DeletedFile = struct { + fs_path: []const u8, + failure: Error!void, + + const Error = + fs.Dir.DeleteFileError || + fs.Dir.DeleteDirError; +}; + +const HashedFile = struct { + fs_path: []const u8, + normalized_path: []const u8, + hash: Manifest.Digest, + failure: Error!void, + kind: Kind, + + const Error = + fs.File.OpenError || + fs.File.ReadError || + fs.File.StatError || + fs.Dir.ReadLinkError; + + const Kind = enum { file, sym_link }; + + fn lessThan(context: void, lhs: *const HashedFile, rhs: *const HashedFile) bool { + _ = context; + return std.mem.lessThan(u8, lhs.normalized_path, rhs.normalized_path); + } +}; + +/// Make a file system path identical independently of operating system path inconsistencies. +/// This converts backslashes into forward slashes. 
/// Returns `fs_path` itself (no copy) when the native separator is already
/// `/`; otherwise returns a copy allocated from `arena` in which every native
/// separator byte has been replaced by `/`.
fn normalizePath(arena: Allocator, fs_path: []const u8) ![]const u8 {
    const canonical_sep = '/';

    if (fs.path.sep == canonical_sep)
        return fs_path;

    const normalized = try arena.dupe(u8, fs_path);
    std.mem.replaceScalar(u8, normalized, fs.path.sep, canonical_sep);
    return normalized;
}

/// Decides which paths of a fetched package are kept.
const Filter = struct {
    /// Set of included paths. An empty set, or a set containing the empty
    /// string, includes everything.
    include_paths: std.StringArrayHashMapUnmanaged(void) = .{},

    /// sub_path is relative to the package root.
    ///
    /// Returns true when the filter is empty, when it contains "", when it
    /// contains `sub_path` itself, or when it contains any parent directory
    /// of `sub_path`.
    pub fn includePath(self: Filter, sub_path: []const u8) bool {
        if (self.include_paths.count() == 0) return true;
        if (self.include_paths.contains("")) return true;
        if (self.include_paths.contains(sub_path)) return true;

        // Check if any included paths are parent directories of sub_path.
        // The lookup must use the ancestor being visited; testing `sub_path`
        // again here would repeat the check above and never match a parent.
        var dirname = sub_path;
        while (std.fs.path.dirname(dirname)) |next_dirname| {
            if (self.include_paths.contains(next_dirname)) return true;
            dirname = next_dirname;
        }

        return false;
    }
};

/// Returns the digest identifying dependency `dep`, or null when it cannot be
/// determined without fetching.
///
/// * If the manifest entry carries an explicit hash, that hash is returned.
/// * A URL dependency without a hash yields null.
/// * A path dependency is resolved relative to `pkg_root` (using a
///   stack-allocated buffer, so no heap allocation happens here) and the
///   result of `relativePathDigest` for the resolved root is returned. If the
///   path cannot be resolved, null is returned.
pub fn depDigest(
    pkg_root: Package.Path,
    cache_root: Cache.Directory,
    dep: Manifest.Dependency,
) ?Manifest.MultiHashHexDigest {
    if (dep.hash) |h| return h[0..Manifest.multihash_hex_digest_len].*;

    switch (dep.location) {
        .url => return null,
        .path => |rel_path| {
            var buf: [fs.MAX_PATH_BYTES]u8 = undefined;
            var fba = std.heap.FixedBufferAllocator.init(&buf);
            const new_root = pkg_root.resolvePosix(fba.allocator(), rel_path) catch
                return null;
            return relativePathDigest(new_root, cache_root);
        },
    }
}

// These are random bytes.
+const package_hash_prefix_cached = [8]u8{ 0x53, 0x7e, 0xfa, 0x94, 0x65, 0xe9, 0xf8, 0x73 }; +const package_hash_prefix_project = [8]u8{ 0xe1, 0x25, 0xee, 0xfa, 0xa6, 0x17, 0x38, 0xcc }; + +const builtin = @import("builtin"); +const std = @import("std"); +const fs = std.fs; +const assert = std.debug.assert; +const ascii = std.ascii; +const Allocator = std.mem.Allocator; +const Cache = std.Build.Cache; +const ThreadPool = std.Thread.Pool; +const WaitGroup = std.Thread.WaitGroup; +const Fetch = @This(); +const main = @import("../main.zig"); +const git = @import("Fetch/git.zig"); +const Package = @import("../Package.zig"); +const Manifest = Package.Manifest; +const ErrorBundle = std.zig.ErrorBundle; diff --git a/src/Package/Fetch/git.zig b/src/Package/Fetch/git.zig new file mode 100644 index 0000000000..af4317702d --- /dev/null +++ b/src/Package/Fetch/git.zig @@ -0,0 +1,1466 @@ +//! Git support for package fetching. +//! +//! This is not intended to support all features of Git: it is limited to the +//! basic functionality needed to clone a repository for the purpose of fetching +//! a package. + +const std = @import("std"); +const mem = std.mem; +const testing = std.testing; +const Allocator = mem.Allocator; +const Sha1 = std.crypto.hash.Sha1; +const assert = std.debug.assert; + +pub const oid_length = Sha1.digest_length; +pub const fmt_oid_length = 2 * oid_length; +/// The ID of a Git object (an SHA-1 hash). +pub const Oid = [oid_length]u8; + +pub fn parseOid(s: []const u8) !Oid { + if (s.len != fmt_oid_length) return error.InvalidOid; + var oid: Oid = undefined; + for (&oid, 0..) 
|*b, i| { + b.* = std.fmt.parseUnsigned(u8, s[2 * i ..][0..2], 16) catch return error.InvalidOid; + } + return oid; +} + +test parseOid { + try testing.expectEqualSlices( + u8, + &.{ 0xCE, 0x91, 0x9C, 0xCF, 0x45, 0x95, 0x18, 0x56, 0xA7, 0x62, 0xFF, 0xDB, 0x8E, 0xF8, 0x50, 0x30, 0x1C, 0xD8, 0xC5, 0x88 }, + &try parseOid("ce919ccf45951856a762ffdb8ef850301cd8c588"), + ); + try testing.expectError(error.InvalidOid, parseOid("ce919ccf")); + try testing.expectError(error.InvalidOid, parseOid("master")); + try testing.expectError(error.InvalidOid, parseOid("HEAD")); +} + +pub const Diagnostics = struct { + allocator: Allocator, + errors: std.ArrayListUnmanaged(Error) = .{}, + + pub const Error = union(enum) { + unable_to_create_sym_link: struct { + code: anyerror, + file_name: []const u8, + link_name: []const u8, + }, + }; + + pub fn deinit(d: *Diagnostics) void { + for (d.errors.items) |item| { + switch (item) { + .unable_to_create_sym_link => |info| { + d.allocator.free(info.file_name); + d.allocator.free(info.link_name); + }, + } + } + d.errors.deinit(d.allocator); + d.* = undefined; + } +}; + +pub const Repository = struct { + odb: Odb, + + pub fn init(allocator: Allocator, pack_file: std.fs.File, index_file: std.fs.File) !Repository { + return .{ .odb = try Odb.init(allocator, pack_file, index_file) }; + } + + pub fn deinit(repository: *Repository) void { + repository.odb.deinit(); + repository.* = undefined; + } + + /// Checks out the repository at `commit_oid` to `worktree`. 
+ pub fn checkout( + repository: *Repository, + worktree: std.fs.Dir, + commit_oid: Oid, + diagnostics: *Diagnostics, + ) !void { + try repository.odb.seekOid(commit_oid); + const tree_oid = tree_oid: { + var commit_object = try repository.odb.readObject(); + if (commit_object.type != .commit) return error.NotACommit; + break :tree_oid try getCommitTree(commit_object.data); + }; + try repository.checkoutTree(worktree, tree_oid, "", diagnostics); + } + + /// Checks out the tree at `tree_oid` to `worktree`. + fn checkoutTree( + repository: *Repository, + dir: std.fs.Dir, + tree_oid: Oid, + current_path: []const u8, + diagnostics: *Diagnostics, + ) !void { + try repository.odb.seekOid(tree_oid); + const tree_object = try repository.odb.readObject(); + if (tree_object.type != .tree) return error.NotATree; + // The tree object may be evicted from the object cache while we're + // iterating over it, so we can make a defensive copy here to make sure + // it remains valid until we're done with it + const tree_data = try repository.odb.allocator.dupe(u8, tree_object.data); + defer repository.odb.allocator.free(tree_data); + + var tree_iter: TreeIterator = .{ .data = tree_data }; + while (try tree_iter.next()) |entry| { + switch (entry.type) { + .directory => { + try dir.makeDir(entry.name); + var subdir = try dir.openDir(entry.name, .{}); + defer subdir.close(); + const sub_path = try std.fs.path.join(repository.odb.allocator, &.{ current_path, entry.name }); + defer repository.odb.allocator.free(sub_path); + try repository.checkoutTree(subdir, entry.oid, sub_path, diagnostics); + }, + .file => { + var file = try dir.createFile(entry.name, .{}); + defer file.close(); + try repository.odb.seekOid(entry.oid); + var file_object = try repository.odb.readObject(); + if (file_object.type != .blob) return error.InvalidFile; + try file.writeAll(file_object.data); + try file.sync(); + }, + .symlink => { + try repository.odb.seekOid(entry.oid); + var symlink_object = try 
repository.odb.readObject(); + if (symlink_object.type != .blob) return error.InvalidFile; + const link_name = symlink_object.data; + dir.symLink(link_name, entry.name, .{}) catch |e| { + const file_name = try std.fs.path.join(diagnostics.allocator, &.{ current_path, entry.name }); + errdefer diagnostics.allocator.free(file_name); + const link_name_dup = try diagnostics.allocator.dupe(u8, link_name); + errdefer diagnostics.allocator.free(link_name_dup); + try diagnostics.errors.append(diagnostics.allocator, .{ .unable_to_create_sym_link = .{ + .code = e, + .file_name = file_name, + .link_name = link_name_dup, + } }); + }; + }, + .gitlink => { + // Consistent with git archive behavior, create the directory but + // do nothing else + try dir.makeDir(entry.name); + }, + } + } + } + + /// Returns the ID of the tree associated with the given commit (provided as + /// raw object data). + fn getCommitTree(commit_data: []const u8) !Oid { + if (!mem.startsWith(u8, commit_data, "tree ") or + commit_data.len < "tree ".len + fmt_oid_length + "\n".len or + commit_data["tree ".len + fmt_oid_length] != '\n') + { + return error.InvalidCommit; + } + return try parseOid(commit_data["tree ".len..][0..fmt_oid_length]); + } + + const TreeIterator = struct { + data: []const u8, + pos: usize = 0, + + const Entry = struct { + type: Type, + executable: bool, + name: [:0]const u8, + oid: Oid, + + const Type = enum(u4) { + directory = 0o4, + file = 0o10, + symlink = 0o12, + gitlink = 0o16, + }; + }; + + fn next(iterator: *TreeIterator) !?Entry { + if (iterator.pos == iterator.data.len) return null; + + const mode_end = mem.indexOfScalarPos(u8, iterator.data, iterator.pos, ' ') orelse return error.InvalidTree; + const mode: packed struct { + permission: u9, + unused: u3, + type: u4, + } = @bitCast(std.fmt.parseUnsigned(u16, iterator.data[iterator.pos..mode_end], 8) catch return error.InvalidTree); + const @"type" = std.meta.intToEnum(Entry.Type, mode.type) catch return error.InvalidTree; + 
const executable = switch (mode.permission) { + 0 => if (@"type" == .file) return error.InvalidTree else false, + 0o644 => if (@"type" != .file) return error.InvalidTree else false, + 0o755 => if (@"type" != .file) return error.InvalidTree else true, + else => return error.InvalidTree, + }; + iterator.pos = mode_end + 1; + + const name_end = mem.indexOfScalarPos(u8, iterator.data, iterator.pos, 0) orelse return error.InvalidTree; + const name = iterator.data[iterator.pos..name_end :0]; + iterator.pos = name_end + 1; + + if (iterator.pos + oid_length > iterator.data.len) return error.InvalidTree; + const oid = iterator.data[iterator.pos..][0..oid_length].*; + iterator.pos += oid_length; + + return .{ .type = @"type", .executable = executable, .name = name, .oid = oid }; + } + }; +}; + +/// A Git object database backed by a packfile. A packfile index is also used +/// for efficient access to objects in the packfile. +/// +/// The format of the packfile and its associated index are documented in +/// [pack-format](https://git-scm.com/docs/pack-format). +const Odb = struct { + pack_file: std.fs.File, + index_header: IndexHeader, + index_file: std.fs.File, + cache: ObjectCache = .{}, + allocator: Allocator, + + /// Initializes the database from open pack and index files. + fn init(allocator: Allocator, pack_file: std.fs.File, index_file: std.fs.File) !Odb { + try pack_file.seekTo(0); + try index_file.seekTo(0); + const index_header = try IndexHeader.read(index_file.reader()); + return .{ + .pack_file = pack_file, + .index_header = index_header, + .index_file = index_file, + .allocator = allocator, + }; + } + + fn deinit(odb: *Odb) void { + odb.cache.deinit(odb.allocator); + odb.* = undefined; + } + + /// Reads the object at the current position in the database. 
+ fn readObject(odb: *Odb) !Object { + var base_offset = try odb.pack_file.getPos(); + var base_header: EntryHeader = undefined; + var delta_offsets = std.ArrayListUnmanaged(u64){}; + defer delta_offsets.deinit(odb.allocator); + const base_object = while (true) { + if (odb.cache.get(base_offset)) |base_object| break base_object; + + base_header = try EntryHeader.read(odb.pack_file.reader()); + switch (base_header) { + .ofs_delta => |ofs_delta| { + try delta_offsets.append(odb.allocator, base_offset); + base_offset = std.math.sub(u64, base_offset, ofs_delta.offset) catch return error.InvalidFormat; + try odb.pack_file.seekTo(base_offset); + }, + .ref_delta => |ref_delta| { + try delta_offsets.append(odb.allocator, base_offset); + try odb.seekOid(ref_delta.base_object); + base_offset = try odb.pack_file.getPos(); + }, + else => { + const base_data = try readObjectRaw(odb.allocator, odb.pack_file.reader(), base_header.uncompressedLength()); + errdefer odb.allocator.free(base_data); + const base_object: Object = .{ .type = base_header.objectType(), .data = base_data }; + try odb.cache.put(odb.allocator, base_offset, base_object); + break base_object; + }, + } + }; + + const base_data = try resolveDeltaChain( + odb.allocator, + odb.pack_file, + base_object, + delta_offsets.items, + &odb.cache, + ); + + return .{ .type = base_object.type, .data = base_data }; + } + + /// Seeks to the beginning of the object with the given ID. 
+ fn seekOid(odb: *Odb, oid: Oid) !void { + const key = oid[0]; + var start_index = if (key > 0) odb.index_header.fan_out_table[key - 1] else 0; + var end_index = odb.index_header.fan_out_table[key]; + const found_index = while (start_index < end_index) { + const mid_index = start_index + (end_index - start_index) / 2; + try odb.index_file.seekTo(IndexHeader.size + mid_index * oid_length); + const mid_oid = try odb.index_file.reader().readBytesNoEof(oid_length); + switch (mem.order(u8, &mid_oid, &oid)) { + .lt => start_index = mid_index + 1, + .gt => end_index = mid_index, + .eq => break mid_index, + } + } else return error.ObjectNotFound; + + const n_objects = odb.index_header.fan_out_table[255]; + const offset_values_start = IndexHeader.size + n_objects * (oid_length + 4); + try odb.index_file.seekTo(offset_values_start + found_index * 4); + const l1_offset: packed struct { value: u31, big: bool } = @bitCast(try odb.index_file.reader().readIntBig(u32)); + const pack_offset = pack_offset: { + if (l1_offset.big) { + const l2_offset_values_start = offset_values_start + n_objects * 4; + try odb.index_file.seekTo(l2_offset_values_start + l1_offset.value * 4); + break :pack_offset try odb.index_file.reader().readIntBig(u64); + } else { + break :pack_offset l1_offset.value; + } + }; + + try odb.pack_file.seekTo(pack_offset); + } +}; + +const Object = struct { + type: Type, + data: []const u8, + + const Type = enum { + commit, + tree, + blob, + tag, + }; +}; + +/// A cache for object data. +/// +/// The purpose of this cache is to speed up resolution of deltas by caching the +/// results of resolving delta objects, while maintaining a maximum cache size +/// to avoid excessive memory usage. 
/// If the total size of the objects in the cache exceeds the maximum, the cache
/// will begin evicting the least recently used objects: when resolving delta
/// chains, the most recently used objects will likely be more helpful as they
/// will be further along in the chain (skipping earlier reconstruction steps).
///
/// Object data stored in the cache is managed by the cache. It should not be
/// freed by the caller at any point after inserting it into the cache. Any
/// objects remaining in the cache will be freed when the cache itself is freed.
const ObjectCache = struct {
    objects: std.AutoHashMapUnmanaged(u64, CacheEntry) = .{},
    lru_nodes: LruList = .{},
    byte_size: usize = 0,

    const max_byte_size = 128 * 1024 * 1024; // 128MiB
    /// A list of offsets stored in the cache, with the most recently used
    /// entries at the end.
    const LruList = std.DoublyLinkedList(u64);
    const CacheEntry = struct { object: Object, lru_node: *LruList.Node };

    /// Frees every cached object together with its LRU node, then the map.
    fn deinit(cache: *ObjectCache, allocator: Allocator) void {
        var entries = cache.objects.valueIterator();
        while (entries.next()) |entry| {
            allocator.free(entry.object.data);
            allocator.destroy(entry.lru_node);
        }
        cache.objects.deinit(allocator);
        cache.* = undefined;
    }

    /// Looks up the object cached for `offset`. On a hit the entry becomes the
    /// most recently used one; on a miss null is returned.
    fn get(cache: *ObjectCache, offset: u64) ?Object {
        const entry = cache.objects.get(offset) orelse return null;
        // Re-append the node so that it sits at the tail of the LRU list.
        cache.lru_nodes.remove(entry.lru_node);
        cache.lru_nodes.append(entry.lru_node);
        return entry.object;
    }

    /// Puts an object in the cache, possibly evicting older entries if the
    /// cache exceeds its maximum size. Note that, although old objects may
    /// be evicted, the object just added to the cache with this function
    /// will not be evicted before the next call to `put` or `deinit` even if
    /// it exceeds the maximum cache size.
+ fn put(cache: *ObjectCache, allocator: Allocator, offset: u64, object: Object) !void { + const lru_node = try allocator.create(LruList.Node); + errdefer allocator.destroy(lru_node); + lru_node.data = offset; + + const gop = try cache.objects.getOrPut(allocator, offset); + if (gop.found_existing) { + cache.byte_size -= gop.value_ptr.object.data.len; + cache.lru_nodes.remove(gop.value_ptr.lru_node); + allocator.destroy(gop.value_ptr.lru_node); + allocator.free(gop.value_ptr.object.data); + } + gop.value_ptr.* = .{ .object = object, .lru_node = lru_node }; + cache.byte_size += object.data.len; + cache.lru_nodes.append(lru_node); + + while (cache.byte_size > max_byte_size and cache.lru_nodes.len > 1) { + // The > 1 check is to make sure that we don't evict the most + // recently added node, even if it by itself happens to exceed the + // maximum size of the cache. + const evict_node = cache.lru_nodes.popFirst().?; + const evict_offset = evict_node.data; + allocator.destroy(evict_node); + const evict_object = cache.objects.get(evict_offset).?.object; + cache.byte_size -= evict_object.data.len; + allocator.free(evict_object.data); + _ = cache.objects.remove(evict_offset); + } + } +}; + +/// A single pkt-line in the Git protocol. +/// +/// The format of a pkt-line is documented in +/// [protocol-common](https://git-scm.com/docs/protocol-common). The special +/// meanings of the delimiter and response-end packets are documented in +/// [protocol-v2](https://git-scm.com/docs/protocol-v2). +const Packet = union(enum) { + flush, + delimiter, + response_end, + data: []const u8, + + const max_data_length = 65516; + + /// Reads a packet in pkt-line format. 
+ fn read(reader: anytype, buf: *[max_data_length]u8) !Packet { + const length = std.fmt.parseUnsigned(u16, &try reader.readBytesNoEof(4), 16) catch return error.InvalidPacket; + switch (length) { + 0 => return .flush, + 1 => return .delimiter, + 2 => return .response_end, + 3 => return error.InvalidPacket, + else => if (length - 4 > max_data_length) return error.InvalidPacket, + } + const data = buf[0 .. length - 4]; + try reader.readNoEof(data); + return .{ .data = data }; + } + + /// Writes a packet in pkt-line format. + fn write(packet: Packet, writer: anytype) !void { + switch (packet) { + .flush => try writer.writeAll("0000"), + .delimiter => try writer.writeAll("0001"), + .response_end => try writer.writeAll("0002"), + .data => |data| { + assert(data.len <= max_data_length); + try writer.print("{x:0>4}", .{data.len + 4}); + try writer.writeAll(data); + }, + } + } +}; + +/// A client session for the Git protocol, currently limited to an HTTP(S) +/// transport. Only protocol version 2 is supported, as documented in +/// [protocol-v2](https://git-scm.com/docs/protocol-v2). +pub const Session = struct { + transport: *std.http.Client, + uri: std.Uri, + supports_agent: bool = false, + supports_shallow: bool = false, + + const agent = "zig/" ++ @import("builtin").zig_version_string; + const agent_capability = std.fmt.comptimePrint("agent={s}\n", .{agent}); + + /// Discovers server capabilities. This should be called before using any + /// other client functionality, or the client will be forced to default to + /// the bare minimum server requirements, which may be considerably less + /// efficient (e.g. no shallow fetches). + /// + /// See the note on `getCapabilities` regarding `redirect_uri`. 
+ pub fn discoverCapabilities( + session: *Session, + allocator: Allocator, + redirect_uri: *[]u8, + ) !void { + var capability_iterator = try session.getCapabilities(allocator, redirect_uri); + defer capability_iterator.deinit(); + while (try capability_iterator.next()) |capability| { + if (mem.eql(u8, capability.key, "agent")) { + session.supports_agent = true; + } else if (mem.eql(u8, capability.key, "fetch")) { + var feature_iterator = mem.splitScalar(u8, capability.value orelse continue, ' '); + while (feature_iterator.next()) |feature| { + if (mem.eql(u8, feature, "shallow")) { + session.supports_shallow = true; + } + } + } + } + } + + /// Returns an iterator over capabilities supported by the server. + /// + /// If the server redirects the request, `error.Redirected` is returned and + /// `redirect_uri` is populated with the URI resulting from the redirects. + /// When this occurs, the value of `redirect_uri` must be freed with + /// `allocator` when the caller is done with it. + fn getCapabilities( + session: Session, + allocator: Allocator, + redirect_uri: *[]u8, + ) !CapabilityIterator { + var info_refs_uri = session.uri; + info_refs_uri.path = try std.fs.path.resolvePosix(allocator, &.{ "/", session.uri.path, "info/refs" }); + defer allocator.free(info_refs_uri.path); + info_refs_uri.query = "service=git-upload-pack"; + info_refs_uri.fragment = null; + + var headers = std.http.Headers.init(allocator); + defer headers.deinit(); + try headers.append("Git-Protocol", "version=2"); + + var request = try session.transport.request(.GET, info_refs_uri, headers, .{ + .max_redirects = 3, + }); + errdefer request.deinit(); + try request.start(.{}); + try request.finish(); + + try request.wait(); + if (request.response.status != .ok) return error.ProtocolError; + if (request.redirects_left < 3) { + if (!mem.endsWith(u8, request.uri.path, "/info/refs")) return error.UnparseableRedirect; + var new_uri = request.uri; + new_uri.path = new_uri.path[0 .. 
new_uri.path.len - "/info/refs".len]; + new_uri.query = null; + redirect_uri.* = try std.fmt.allocPrint(allocator, "{+/}", .{new_uri}); + return error.Redirected; + } + + const reader = request.reader(); + var buf: [Packet.max_data_length]u8 = undefined; + var state: enum { response_start, response_content } = .response_start; + while (true) { + // Some Git servers (at least GitHub) include an additional + // '# service=git-upload-pack' informative response before sending + // the expected 'version 2' packet and capability information. + // This is not universal: SourceHut, for example, does not do this. + // Thus, we need to skip any such useless additional responses + // before we get the one we're actually looking for. The responses + // will be delimited by flush packets. + const packet = Packet.read(reader, &buf) catch |e| switch (e) { + error.EndOfStream => return error.UnsupportedProtocol, // 'version 2' packet not found + else => |other| return other, + }; + switch (packet) { + .flush => state = .response_start, + .data => |data| switch (state) { + .response_start => if (mem.eql(u8, data, "version 2\n")) { + return .{ .request = request }; + } else { + state = .response_content; + }, + else => {}, + }, + else => return error.UnexpectedPacket, + } + } + } + + const CapabilityIterator = struct { + request: std.http.Client.Request, + buf: [Packet.max_data_length]u8 = undefined, + + const Capability = struct { + key: []const u8, + value: ?[]const u8 = null, + }; + + fn deinit(iterator: *CapabilityIterator) void { + iterator.request.deinit(); + iterator.* = undefined; + } + + fn next(iterator: *CapabilityIterator) !?Capability { + switch (try Packet.read(iterator.request.reader(), &iterator.buf)) { + .flush => return null, + .data => |data| if (data.len > 0 and data[data.len - 1] == '\n') { + if (mem.indexOfScalar(u8, data, '=')) |separator_pos| { + return .{ .key = data[0..separator_pos], .value = data[separator_pos + 1 .. 
data.len - 1] }; + } else { + return .{ .key = data[0 .. data.len - 1] }; + } + } else return error.UnexpectedPacket, + else => return error.UnexpectedPacket, + } + } + }; + + const ListRefsOptions = struct { + /// The ref prefixes (if any) to use to filter the refs available on the + /// server. Note that the client must still check the returned refs + /// against its desired filters itself: the server is not required to + /// respect these prefix filters and may return other refs as well. + ref_prefixes: []const []const u8 = &.{}, + /// Whether to include symref targets for returned symbolic refs. + include_symrefs: bool = false, + /// Whether to include the peeled object ID for returned tag refs. + include_peeled: bool = false, + }; + + /// Returns an iterator over refs known to the server. + pub fn listRefs(session: Session, allocator: Allocator, options: ListRefsOptions) !RefIterator { + var upload_pack_uri = session.uri; + upload_pack_uri.path = try std.fs.path.resolvePosix(allocator, &.{ "/", session.uri.path, "git-upload-pack" }); + defer allocator.free(upload_pack_uri.path); + upload_pack_uri.query = null; + upload_pack_uri.fragment = null; + + var headers = std.http.Headers.init(allocator); + defer headers.deinit(); + try headers.append("Content-Type", "application/x-git-upload-pack-request"); + try headers.append("Git-Protocol", "version=2"); + + var body = std.ArrayListUnmanaged(u8){}; + defer body.deinit(allocator); + const body_writer = body.writer(allocator); + try Packet.write(.{ .data = "command=ls-refs\n" }, body_writer); + if (session.supports_agent) { + try Packet.write(.{ .data = agent_capability }, body_writer); + } + try Packet.write(.delimiter, body_writer); + for (options.ref_prefixes) |ref_prefix| { + const ref_prefix_packet = try std.fmt.allocPrint(allocator, "ref-prefix {s}\n", .{ref_prefix}); + defer allocator.free(ref_prefix_packet); + try Packet.write(.{ .data = ref_prefix_packet }, body_writer); + } + if (options.include_symrefs) { + 
try Packet.write(.{ .data = "symrefs\n" }, body_writer); + } + if (options.include_peeled) { + try Packet.write(.{ .data = "peel\n" }, body_writer); + } + try Packet.write(.flush, body_writer); + + var request = try session.transport.request(.POST, upload_pack_uri, headers, .{ + .handle_redirects = false, + }); + errdefer request.deinit(); + request.transfer_encoding = .{ .content_length = body.items.len }; + try request.start(.{}); + try request.writeAll(body.items); + try request.finish(); + + try request.wait(); + if (request.response.status != .ok) return error.ProtocolError; + + return .{ .request = request }; + } + + pub const RefIterator = struct { + request: std.http.Client.Request, + buf: [Packet.max_data_length]u8 = undefined, + + pub const Ref = struct { + oid: Oid, + name: []const u8, + symref_target: ?[]const u8, + peeled: ?Oid, + }; + + pub fn deinit(iterator: *RefIterator) void { + iterator.request.deinit(); + iterator.* = undefined; + } + + pub fn next(iterator: *RefIterator) !?Ref { + switch (try Packet.read(iterator.request.reader(), &iterator.buf)) { + .flush => return null, + .data => |data| { + const oid_sep_pos = mem.indexOfScalar(u8, data, ' ') orelse return error.InvalidRefPacket; + const oid = parseOid(data[0..oid_sep_pos]) catch return error.InvalidRefPacket; + + const name_sep_pos = mem.indexOfAnyPos(u8, data, oid_sep_pos + 1, " \n") orelse return error.InvalidRefPacket; + const name = data[oid_sep_pos + 1 .. name_sep_pos]; + + var symref_target: ?[]const u8 = null; + var peeled: ?Oid = null; + var last_sep_pos = name_sep_pos; + while (data[last_sep_pos] == ' ') { + const next_sep_pos = mem.indexOfAnyPos(u8, data, last_sep_pos + 1, " \n") orelse return error.InvalidRefPacket; + const attribute = data[last_sep_pos + 1 .. 
next_sep_pos]; + if (mem.startsWith(u8, attribute, "symref-target:")) { + symref_target = attribute["symref-target:".len..]; + } else if (mem.startsWith(u8, attribute, "peeled:")) { + peeled = parseOid(attribute["peeled:".len..]) catch return error.InvalidRefPacket; + } + last_sep_pos = next_sep_pos; + } + + return .{ .oid = oid, .name = name, .symref_target = symref_target, .peeled = peeled }; + }, + else => return error.UnexpectedPacket, + } + } + }; + + /// Fetches the given refs from the server. A shallow fetch (depth 1) is + /// performed if the server supports it. + pub fn fetch(session: Session, allocator: Allocator, wants: []const []const u8) !FetchStream { + var upload_pack_uri = session.uri; + upload_pack_uri.path = try std.fs.path.resolvePosix(allocator, &.{ "/", session.uri.path, "git-upload-pack" }); + defer allocator.free(upload_pack_uri.path); + upload_pack_uri.query = null; + upload_pack_uri.fragment = null; + + var headers = std.http.Headers.init(allocator); + defer headers.deinit(); + try headers.append("Content-Type", "application/x-git-upload-pack-request"); + try headers.append("Git-Protocol", "version=2"); + + var body = std.ArrayListUnmanaged(u8){}; + defer body.deinit(allocator); + const body_writer = body.writer(allocator); + try Packet.write(.{ .data = "command=fetch\n" }, body_writer); + if (session.supports_agent) { + try Packet.write(.{ .data = agent_capability }, body_writer); + } + try Packet.write(.delimiter, body_writer); + // Our packfile parser supports the OFS_DELTA object type + try Packet.write(.{ .data = "ofs-delta\n" }, body_writer); + // We do not currently convey server progress information to the user + try Packet.write(.{ .data = "no-progress\n" }, body_writer); + if (session.supports_shallow) { + try Packet.write(.{ .data = "deepen 1\n" }, body_writer); + } + for (wants) |want| { + var buf: [Packet.max_data_length]u8 = undefined; + const arg = std.fmt.bufPrint(&buf, "want {s}\n", .{want}) catch unreachable; + try 
Packet.write(.{ .data = arg }, body_writer); + } + try Packet.write(.{ .data = "done\n" }, body_writer); + try Packet.write(.flush, body_writer); + + var request = try session.transport.request(.POST, upload_pack_uri, headers, .{ + .handle_redirects = false, + }); + errdefer request.deinit(); + request.transfer_encoding = .{ .content_length = body.items.len }; + try request.start(.{}); + try request.writeAll(body.items); + try request.finish(); + + try request.wait(); + if (request.response.status != .ok) return error.ProtocolError; + + const reader = request.reader(); + // We are not interested in any of the sections of the returned fetch + // data other than the packfile section, since we aren't doing anything + // complex like ref negotiation (this is a fresh clone). + var state: enum { section_start, section_content } = .section_start; + while (true) { + var buf: [Packet.max_data_length]u8 = undefined; + const packet = try Packet.read(reader, &buf); + switch (state) { + .section_start => switch (packet) { + .data => |data| if (mem.eql(u8, data, "packfile\n")) { + return .{ .request = request }; + } else { + state = .section_content; + }, + else => return error.UnexpectedPacket, + }, + .section_content => switch (packet) { + .delimiter => state = .section_start, + .data => {}, + else => return error.UnexpectedPacket, + }, + } + } + } + + pub const FetchStream = struct { + request: std.http.Client.Request, + buf: [Packet.max_data_length]u8 = undefined, + pos: usize = 0, + len: usize = 0, + + pub fn deinit(stream: *FetchStream) void { + stream.request.deinit(); + } + + pub const ReadError = std.http.Client.Request.ReadError || error{ + InvalidPacket, + ProtocolError, + UnexpectedPacket, + }; + pub const Reader = std.io.Reader(*FetchStream, ReadError, read); + + const StreamCode = enum(u8) { + pack_data = 1, + progress = 2, + fatal_error = 3, + _, + }; + + pub fn reader(stream: *FetchStream) Reader { + return .{ .context = stream }; + } + + pub fn read(stream: 
*FetchStream, buf: []u8) !usize { + if (stream.pos == stream.len) { + while (true) { + switch (try Packet.read(stream.request.reader(), &stream.buf)) { + .flush => return 0, + .data => |data| if (data.len > 1) switch (@as(StreamCode, @enumFromInt(data[0]))) { + .pack_data => { + stream.pos = 1; + stream.len = data.len; + break; + }, + .fatal_error => return error.ProtocolError, + else => {}, + }, + else => return error.UnexpectedPacket, + } + } + } + + const size = @min(buf.len, stream.len - stream.pos); + @memcpy(buf[0..size], stream.buf[stream.pos .. stream.pos + size]); + stream.pos += size; + return size; + } + }; +}; + +const PackHeader = struct { + total_objects: u32, + + const signature = "PACK"; + const supported_version = 2; + + fn read(reader: anytype) !PackHeader { + const actual_signature = reader.readBytesNoEof(4) catch |e| switch (e) { + error.EndOfStream => return error.InvalidHeader, + else => |other| return other, + }; + if (!mem.eql(u8, &actual_signature, signature)) return error.InvalidHeader; + const version = reader.readIntBig(u32) catch |e| switch (e) { + error.EndOfStream => return error.InvalidHeader, + else => |other| return other, + }; + if (version != supported_version) return error.UnsupportedVersion; + const total_objects = reader.readIntBig(u32) catch |e| switch (e) { + error.EndOfStream => return error.InvalidHeader, + else => |other| return other, + }; + return .{ .total_objects = total_objects }; + } +}; + +const EntryHeader = union(Type) { + commit: Undeltified, + tree: Undeltified, + blob: Undeltified, + tag: Undeltified, + ofs_delta: OfsDelta, + ref_delta: RefDelta, + + const Type = enum(u3) { + commit = 1, + tree = 2, + blob = 3, + tag = 4, + ofs_delta = 6, + ref_delta = 7, + }; + + const Undeltified = struct { + uncompressed_length: u64, + }; + + const OfsDelta = struct { + offset: u64, + uncompressed_length: u64, + }; + + const RefDelta = struct { + base_object: Oid, + uncompressed_length: u64, + }; + + fn objectType(header: 
EntryHeader) Object.Type { + return switch (header) { + inline .commit, .tree, .blob, .tag => |_, tag| @field(Object.Type, @tagName(tag)), + else => unreachable, + }; + } + + fn uncompressedLength(header: EntryHeader) u64 { + return switch (header) { + inline else => |entry| entry.uncompressed_length, + }; + } + + fn read(reader: anytype) !EntryHeader { + const InitialByte = packed struct { len: u4, type: u3, has_next: bool }; + const initial: InitialByte = @bitCast(reader.readByte() catch |e| switch (e) { + error.EndOfStream => return error.InvalidFormat, + else => |other| return other, + }); + const rest_len = if (initial.has_next) try readSizeVarInt(reader) else 0; + var uncompressed_length: u64 = initial.len; + uncompressed_length |= std.math.shlExact(u64, rest_len, 4) catch return error.InvalidFormat; + const @"type" = std.meta.intToEnum(EntryHeader.Type, initial.type) catch return error.InvalidFormat; + return switch (@"type") { + inline .commit, .tree, .blob, .tag => |tag| @unionInit(EntryHeader, @tagName(tag), .{ + .uncompressed_length = uncompressed_length, + }), + .ofs_delta => .{ .ofs_delta = .{ + .offset = try readOffsetVarInt(reader), + .uncompressed_length = uncompressed_length, + } }, + .ref_delta => .{ .ref_delta = .{ + .base_object = reader.readBytesNoEof(oid_length) catch |e| switch (e) { + error.EndOfStream => return error.InvalidFormat, + else => |other| return other, + }, + .uncompressed_length = uncompressed_length, + } }, + }; + } +}; + +fn readSizeVarInt(r: anytype) !u64 { + const Byte = packed struct { value: u7, has_next: bool }; + var b: Byte = @bitCast(try r.readByte()); + var value: u64 = b.value; + var shift: u6 = 0; + while (b.has_next) { + b = @bitCast(try r.readByte()); + shift = std.math.add(u6, shift, 7) catch return error.InvalidFormat; + value |= @as(u64, b.value) << shift; + } + return value; +} + +fn readOffsetVarInt(r: anytype) !u64 { + const Byte = packed struct { value: u7, has_next: bool }; + var b: Byte = @bitCast(try 
r.readByte()); + var value: u64 = b.value; + while (b.has_next) { + b = @bitCast(try r.readByte()); + value = std.math.shlExact(u64, value + 1, 7) catch return error.InvalidFormat; + value |= b.value; + } + return value; +} + +const IndexHeader = struct { + fan_out_table: [256]u32, + + const signature = "\xFFtOc"; + const supported_version = 2; + const size = 4 + 4 + @sizeOf([256]u32); + + fn read(reader: anytype) !IndexHeader { + var header_bytes = try reader.readBytesNoEof(size); + if (!mem.eql(u8, header_bytes[0..4], signature)) return error.InvalidHeader; + const version = mem.readIntBig(u32, header_bytes[4..8]); + if (version != supported_version) return error.UnsupportedVersion; + + var fan_out_table: [256]u32 = undefined; + var fan_out_table_stream = std.io.fixedBufferStream(header_bytes[8..]); + const fan_out_table_reader = fan_out_table_stream.reader(); + for (&fan_out_table) |*entry| { + entry.* = fan_out_table_reader.readIntBig(u32) catch unreachable; + } + return .{ .fan_out_table = fan_out_table }; + } +}; + +const IndexEntry = struct { + offset: u64, + crc32: u32, +}; + +/// Writes out a version 2 index for the given packfile, as documented in +/// [pack-format](https://git-scm.com/docs/pack-format). 
///
/// The packfile is read from its start and validated along the way. Returns
/// `error.IncompletePack` if the base of some delta object is not part of
/// the pack.
pub fn indexPack(allocator: Allocator, pack: std.fs.File, index_writer: anytype) !void {
    try pack.seekTo(0);

    var index_entries = std.AutoHashMapUnmanaged(Oid, IndexEntry){};
    defer index_entries.deinit(allocator);
    var pending_deltas = std.ArrayListUnmanaged(IndexEntry){};
    defer pending_deltas.deinit(allocator);

    const pack_checksum = try indexPackFirstPass(allocator, pack, &index_entries, &pending_deltas);

    // Resolve the deltas in rounds: a ref-based delta can only be hashed once
    // its base is indexed, so keep sweeping over the pending list until it is
    // empty. A round without any progress means that some base is missing.
    var cache: ObjectCache = .{};
    defer cache.deinit(allocator);
    var unresolved = pending_deltas.items.len;
    while (unresolved > 0) : (unresolved = pending_deltas.items.len) {
        // Walk backwards so that `swapRemove` only ever moves entries which
        // were already visited in this round.
        var i: usize = unresolved;
        while (i > 0) {
            i -= 1;
            const delta = pending_deltas.items[i];
            const oid = try indexPackHashDelta(allocator, pack, delta, index_entries, &cache) orelse continue;
            try index_entries.put(allocator, oid, delta);
            _ = pending_deltas.swapRemove(i);
        }
        if (pending_deltas.items.len == unresolved) return error.IncompletePack;
    }

    // The index lists the objects sorted by ID.
    var oids = std.ArrayListUnmanaged(Oid){};
    defer oids.deinit(allocator);
    try oids.ensureTotalCapacityPrecise(allocator, index_entries.count());
    var oid_iter = index_entries.keyIterator();
    while (oid_iter.next()) |oid| {
        oids.appendAssumeCapacity(oid.*);
    }
    mem.sortUnstable(Oid, oids.items, {}, struct {
        fn lessThan(_: void, lhs: Oid, rhs: Oid) bool {
            return mem.lessThan(u8, &lhs, &rhs);
        }
    }.lessThan);

    // Fan-out entry `k` is the number of objects whose first ID byte is
    // <= `k`: count the objects per first byte, then accumulate the counts.
    var fan_out_table = [_]u32{0} ** 256;
    for (oids.items) |oid| {
        fan_out_table[oid[0]] += 1;
    }
    var running_total: u32 = 0;
    for (&fan_out_table) |*fan_out_entry| {
        running_total += fan_out_entry.*;
        fan_out_entry.* = running_total;
    }

    // Everything written through `writer` is included in the checksum which
    // ends the index.
    var index_hashed_writer = hashedWriter(index_writer, Sha1.init(.{}));
    const writer = index_hashed_writer.writer();
    try writer.writeAll(IndexHeader.signature);
    try writer.writeIntBig(u32, IndexHeader.supported_version);
    for (fan_out_table) |fan_out_entry| {
        try writer.writeIntBig(u32, fan_out_entry);
    }

    for (oids.items) |oid| {
        try writer.writeAll(&oid);
    }

    for (oids.items) |oid| {
        try writer.writeIntBig(u32, index_entries.get(oid).?.crc32);
    }

    // Offsets which fit in 31 bits are stored directly. A larger offset is
    // stored in a second table of 64-bit values, and the 32-bit slot holds its
    // position in that table with the high bit set.
    var big_offsets = std.ArrayListUnmanaged(u64){};
    defer big_offsets.deinit(allocator);
    for (oids.items) |oid| {
        const offset = index_entries.get(oid).?.offset;
        if (offset > std.math.maxInt(u31)) {
            const big_index = big_offsets.items.len;
            try big_offsets.append(allocator, offset);
            try writer.writeIntBig(u32, @as(u32, @intCast(big_index)) | (1 << 31));
        } else {
            try writer.writeIntBig(u32, @intCast(offset));
        }
    }
    for (big_offsets.items) |offset| {
        try writer.writeIntBig(u64, offset);
    }

    try writer.writeAll(&pack_checksum);
    // The index checksum is written to the underlying writer, as it must not
    // be part of its own hash.
    const index_checksum = index_hashed_writer.hasher.finalResult();
    try index_writer.writeAll(&index_checksum);
}

/// Performs the first pass over the packfile data for index construction.
/// This will index all non-delta objects, queue delta objects for further
/// processing, and return the pack checksum (which is part of the index
/// format).
///
/// Returns `error.InvalidObject` if the decompressed size of an entry differs
/// from the size in its header, `error.CorruptedPack` if the checksum
/// recorded at the end of the pack does not match the data, and
/// `error.InvalidFormat` if any data follows the checksum.
fn indexPackFirstPass(
    allocator: Allocator,
    pack: std.fs.File,
    index_entries: *std.AutoHashMapUnmanaged(Oid, IndexEntry),
    pending_deltas: *std.ArrayListUnmanaged(IndexEntry),
) ![Sha1.digest_length]u8 {
    // Reader stack: buffering, then byte counting (for entry offsets), then
    // SHA-1 hashing (for the pack checksum).
    var buffered = std.io.bufferedReader(pack.reader());
    var counting = std.io.countingReader(buffered.reader());
    var hashed = std.compress.hashedReader(counting.reader(), Sha1.init(.{}));
    const pack_reader = hashed.reader();

    const pack_header = try PackHeader.read(pack_reader);

    var entry_index: u32 = 0;
    while (entry_index < pack_header.total_objects) : (entry_index += 1) {
        const entry_offset = counting.bytes_read;
        // The CRC32 covers the packed bytes of the entry, header included.
        var crc32_reader = std.compress.hashedReader(pack_reader, std.hash.Crc32.init());
        const entry_header = try EntryHeader.read(crc32_reader.reader());
        switch (entry_header) {
            inline .commit, .tree, .blob, .tag => |object, tag| {
                var decompress_stream = try std.compress.zlib.decompressStream(allocator, crc32_reader.reader());
                defer decompress_stream.deinit();
                var content_counter = std.io.countingReader(decompress_stream.reader());
                var oid_hasher = hashedWriter(std.io.null_writer, Sha1.init(.{}));
                const oid_writer = oid_hasher.writer();
                // The object header is not included in the pack data but is
                // part of the object's ID
                try oid_writer.print("{s} {}\x00", .{ @tagName(tag), object.uncompressed_length });
                var fifo = std.fifo.LinearFifo(u8, .{ .Static = 4096 }).init();
                try fifo.pump(content_counter.reader(), oid_writer);
                if (content_counter.bytes_read != object.uncompressed_length) {
                    return error.InvalidObject;
                }
                const oid = oid_hasher.hasher.finalResult();
                try index_entries.put(allocator, oid, .{
                    .offset = entry_offset,
                    .crc32 = crc32_reader.hasher.final(),
                });
            },
            inline .ofs_delta, .ref_delta => |delta| {
                // The ID of a delta object is only known once the delta has
                // been applied to its base. Here the delta data is just
                // decompressed and discarded, to validate its length and to
                // advance to the next entry.
                var decompress_stream = try std.compress.zlib.decompressStream(allocator, crc32_reader.reader());
                defer decompress_stream.deinit();
                var content_counter = std.io.countingReader(decompress_stream.reader());
                var fifo = std.fifo.LinearFifo(u8, .{ .Static = 4096 }).init();
                try fifo.pump(content_counter.reader(), std.io.null_writer);
                if (content_counter.bytes_read != delta.uncompressed_length) {
                    return error.InvalidObject;
                }
                try pending_deltas.append(allocator, .{
                    .offset = entry_offset,
                    .crc32 = crc32_reader.hasher.final(),
                });
            },
        }
    }

    // The recorded checksum is read below the hashing reader, as it is not
    // part of the data which it covers.
    const pack_checksum = hashed.hasher.finalResult();
    const recorded_checksum = try buffered.reader().readBytesNoEof(Sha1.digest_length);
    if (!mem.eql(u8, &pack_checksum, &recorded_checksum)) {
        return error.CorruptedPack;
    }
    // Nothing may follow the checksum.
    _ = buffered.reader().readByte() catch |e| switch (e) {
        error.EndOfStream => return pack_checksum,
        else => |other| return other,
    };
    return error.InvalidFormat;
}

/// Attempts to determine the final object ID of the given deltified object.
/// May return null if this is not yet possible (if the delta is a ref-based
/// delta and we do not yet know the offset of the base object).
fn indexPackHashDelta(
    allocator: Allocator,
    pack: std.fs.File,
    delta: IndexEntry,
    index_entries: std.AutoHashMapUnmanaged(Oid, IndexEntry),
    cache: *ObjectCache,
) !?Oid {
    // Walk from the delta towards its base, remembering the offset of every
    // delta passed on the way. The walk stops at the first offset that is
    // already in `cache`, or at the first non-delta entry (which is then read
    // and cached).
    var chain = std.ArrayListUnmanaged(u64){};
    defer chain.deinit(allocator);

    var cursor = delta.offset;
    const root = while (true) {
        if (cache.get(cursor)) |cached| break cached;

        try pack.seekTo(cursor);
        const header = try EntryHeader.read(pack.reader());
        switch (header) {
            .ofs_delta => |ofs_delta| {
                try chain.append(allocator, cursor);
                // The base lies `ofs_delta.offset` bytes before this entry.
                cursor = std.math.sub(u64, cursor, ofs_delta.offset) catch return error.InvalidObject;
            },
            .ref_delta => |ref_delta| {
                try chain.append(allocator, cursor);
                // The base is named by ID; give up for now if it has not
                // been indexed yet.
                const base_entry = index_entries.get(ref_delta.base_object) orelse return null;
                cursor = base_entry.offset;
            },
            else => {
                const raw = try readObjectRaw(allocator, pack.reader(), header.uncompressedLength());
                errdefer allocator.free(raw);
                const object: Object = .{ .type = header.objectType(), .data = raw };
                try cache.put(allocator, cursor, object);
                break object;
            },
        }
    };

    const resolved = try resolveDeltaChain(allocator, pack, root, chain.items, cache);

    // Object ID = SHA-1 over "<type> <len>\x00" followed by the object data.
    var sha1 = Sha1.init(.{});
    var header_writer = hashedWriter(std.io.null_writer, &sha1);
    try header_writer.writer().print("{s} {}\x00", .{ @tagName(root.type), resolved.len });
    sha1.update(resolved);
    return sha1.finalResult();
}

/// Resolves a chain of deltas, returning the final base object data. `pack` is
/// assumed to be looking at the start of the object data for the base object of
/// the chain, and will then apply the deltas in `delta_offsets` in reverse order
/// to obtain the final object.
fn resolveDeltaChain(
    allocator: Allocator,
    pack: std.fs.File,
    base_object: Object,
    delta_offsets: []const u64,
    cache: *ObjectCache,
) ![]const u8 {
    // `current` is the data every remaining delta is applied on top of. Each
    // expanded intermediate result is handed to `cache` under the offset of
    // the delta that produced it.
    var current = base_object.data;

    // Deltas were collected walking towards the base, so apply last-to-first.
    var remaining: usize = delta_offsets.len;
    while (remaining > 0) : (remaining -= 1) {
        const delta_offset = delta_offsets[remaining - 1];

        try pack.seekTo(delta_offset);
        const delta_header = try EntryHeader.read(pack.reader());
        const delta_bytes = try readObjectRaw(allocator, pack.reader(), delta_header.uncompressedLength());
        defer allocator.free(delta_bytes);

        var delta_stream = std.io.fixedBufferStream(delta_bytes);
        const delta_reader = delta_stream.reader();
        _ = try readSizeVarInt(delta_reader); // base object size
        const expanded_size = try readSizeVarInt(delta_reader);

        const expanded_len = std.math.cast(usize, expanded_size) orelse return error.ObjectTooLarge;
        const expanded = try allocator.alloc(u8, expanded_len);
        errdefer allocator.free(expanded);

        var out_stream = std.io.fixedBufferStream(expanded);
        var base_stream = std.io.fixedBufferStream(current);
        try expandDelta(&base_stream, delta_reader, out_stream.writer());
        // The delta must produce exactly the size it announced.
        if (out_stream.pos != expanded_size) return error.InvalidObject;

        try cache.put(allocator, delta_offset, .{ .type = base_object.type, .data = expanded });
        current = expanded;
    }
    return current;
}

/// Reads the complete contents of an object from `reader`. This function may
/// read more bytes than required from `reader`, so the reader position after
/// returning is not reliable.
fn readObjectRaw(allocator: Allocator, reader: anytype, size: u64) ![]u8 {
    // Decompresses exactly `size` bytes of zlib data from `reader` into a
    // newly allocated slice (caller frees with `allocator`). Fails with
    // `error.ObjectTooLarge` if `size` does not fit in `usize` and with
    // `error.InvalidFormat` if the stream holds more than `size` bytes.
    const alloc_size = std.math.cast(usize, size) orelse return error.ObjectTooLarge;
    var buffered_reader = std.io.bufferedReader(reader);
    var decompress_stream = try std.compress.zlib.decompressStream(allocator, buffered_reader.reader());
    defer decompress_stream.deinit();
    const data = try allocator.alloc(u8, alloc_size);
    errdefer allocator.free(data);
    try decompress_stream.reader().readNoEof(data);
    // One more byte must hit end-of-stream; otherwise the object is longer
    // than its header claimed.
    _ = decompress_stream.reader().readByte() catch |e| switch (e) {
        error.EndOfStream => return data,
        else => |other| return other,
    };
    return error.InvalidFormat;
}

/// Expands delta data from `delta_reader` to `writer`. `base_object` must
/// support `reader` and `seekTo` (such as a `std.io.FixedBufferStream`).
///
/// The format of the delta data is documented in
/// [pack-format](https://git-scm.com/docs/pack-format).
fn expandDelta(base_object: anytype, delta_reader: anytype, writer: anytype) !void {
    while (true) {
        // Instruction byte: the high bit selects copy-from-base, the low
        // seven bits are flags (copy) or a literal length (insert).
        const inst: packed struct { value: u7, copy: bool } = @bitCast(delta_reader.readByte() catch |e| switch (e) {
            error.EndOfStream => return,
            else => |other| return other,
        });
        if (inst.copy) {
            // Each flag says whether the corresponding offset/size byte is
            // present in the stream; absent bytes are zero.
            const available: packed struct {
                offset1: bool,
                offset2: bool,
                offset3: bool,
                offset4: bool,
                size1: bool,
                size2: bool,
                size3: bool,
            } = @bitCast(inst.value);
            const offset_parts: packed struct { offset1: u8, offset2: u8, offset3: u8, offset4: u8 } = .{
                .offset1 = if (available.offset1) try delta_reader.readByte() else 0,
                .offset2 = if (available.offset2) try delta_reader.readByte() else 0,
                .offset3 = if (available.offset3) try delta_reader.readByte() else 0,
                .offset4 = if (available.offset4) try delta_reader.readByte() else 0,
            };
            const offset: u32 = @bitCast(offset_parts);
            const size_parts: packed struct { size1: u8, size2: u8, size3: u8 } = .{
                .size1 = if (available.size1) try delta_reader.readByte() else 0,
                .size2 = if (available.size2) try delta_reader.readByte() else 0,
                .size3 = if (available.size3) try delta_reader.readByte() else 0,
            };
            var size: u24 = @bitCast(size_parts);
            // A size of zero encodes 0x10000.
            if (size == 0) size = 0x10000;
            try base_object.seekTo(offset);
            var copy_reader = std.io.limitedReader(base_object.reader(), size);
            var fifo = std.fifo.LinearFifo(u8, .{ .Static = 4096 }).init();
            try fifo.pump(copy_reader.reader(), writer);
        } else if (inst.value != 0) {
            // Insert instruction: the next `inst.value` delta bytes are
            // copied to the output verbatim.
            var data_reader = std.io.limitedReader(delta_reader, inst.value);
            var fifo = std.fifo.LinearFifo(u8, .{ .Static = 4096 }).init();
            try fifo.pump(data_reader.reader(), writer);
        } else {
            return error.InvalidDeltaInstruction;
        }
    }
}

/// Returns a writer type that forwards to `child_writer` and feeds every
/// byte the child actually accepted into `hasher`.
fn HashedWriter(
    comptime WriterType: anytype,
    comptime HasherType: anytype,
) type {
    return struct {
        child_writer: WriterType,
        hasher: HasherType,

        const Error = WriterType.Error;
        const Writer = std.io.Writer(*@This(), Error, write);

        fn write(hashed_writer: *@This(), buf: []const u8) Error!usize {
            const amt = try hashed_writer.child_writer.write(buf);
            // Hash only what was written: on a short write the caller
            // retries with the remainder, which must not be hashed twice.
            hashed_writer.hasher.update(buf[0..amt]);
            return amt;
        }

        fn writer(hashed_writer: *@This()) Writer {
            return .{ .context = hashed_writer };
        }
    };
}

/// Convenience constructor for `HashedWriter` that infers both type
/// parameters from its arguments.
fn hashedWriter(
    writer: anytype,
    hasher: anytype,
) HashedWriter(@TypeOf(writer), @TypeOf(hasher)) {
    return .{ .child_writer = writer, .hasher = hasher };
}

test "packfile indexing and checkout" {
    // To verify the contents of this packfile without using the code in this
    // file:
    //
    // 1. Create a new empty Git repository (`git init`)
    // 2. `git unpack-objects <path/to/testrepo.pack`
    // 3. `git fsck` -> note the "dangling commit" ID (which matches the commit
    // checked out below)
    // 4. `git checkout dd582c0720819ab7130b103635bd7271b9fd4feb`
    const testrepo_pack = @embedFile("git/testdata/testrepo.pack");

    var git_dir = testing.tmpDir(.{});
    defer git_dir.cleanup();
    const pack_file = try git_dir.dir.createFile("testrepo.pack", .{ .read = true });
    defer pack_file.close();
    try pack_file.writeAll(testrepo_pack);

    const index_file = try git_dir.dir.createFile("testrepo.idx", .{ .read = true });
    defer index_file.close();
    try indexPack(testing.allocator, pack_file, index_file.writer());

    // Arbitrary size limit on files read while checking the repository contents
    // (all files in the test repo are known to be much smaller than this)
    const max_file_size = 4096;

    const index_file_data = try git_dir.dir.readFileAlloc(testing.allocator, "testrepo.idx", max_file_size);
    defer testing.allocator.free(index_file_data);
    // testrepo.idx is generated by Git. The index created by this file should
    // match it exactly. Running `git verify-pack -v testrepo.pack` can verify
    // this.
    const testrepo_idx = @embedFile("git/testdata/testrepo.idx");
    try testing.expectEqualSlices(u8, testrepo_idx, index_file_data);

    var repository = try Repository.init(testing.allocator, pack_file, index_file);
    defer repository.deinit();

    var worktree = testing.tmpIterableDir(.{});
    defer worktree.cleanup();

    const commit_id = try parseOid("dd582c0720819ab7130b103635bd7271b9fd4feb");
    try repository.checkout(worktree.iterable_dir.dir, commit_id);

    const expected_files: []const []const u8 = &.{
        "dir/file",
        "dir/subdir/file",
        "dir/subdir/file2",
        "dir2/file",
        "dir3/file",
        "dir3/file2",
        "file",
        "file2",
        "file3",
        "file4",
        "file5",
        "file6",
        "file7",
        "file8",
        "file9",
    };
    var actual_files: std.ArrayListUnmanaged([]u8) = .{};
    defer actual_files.deinit(testing.allocator);
    defer for (actual_files.items) |file| testing.allocator.free(file);
    var walker = try worktree.iterable_dir.walk(testing.allocator);
    defer walker.deinit();
    while (try walker.next()) |entry| {
        if (entry.kind != .file) continue;
        const path = try testing.allocator.dupe(u8, entry.path);
        errdefer testing.allocator.free(path);
        // Normalize separators so the comparison is platform independent.
        mem.replaceScalar(u8, path, std.fs.path.sep, '/');
        try actual_files.append(testing.allocator, path);
    }
    mem.sortUnstable([]u8, actual_files.items, {}, struct {
        fn lessThan(_: void, a: []u8, b: []u8) bool {
            return mem.lessThan(u8, a, b);
        }
    }.lessThan);
    try testing.expectEqualDeep(expected_files, actual_files.items);

    const expected_file_contents =
        \\revision 1
        \\revision 2
        \\revision 4
        \\revision 5
        \\revision 7
        \\revision 8
        \\revision 9
        \\revision 10
        \\revision 12
        \\revision 13
        \\revision 14
        \\revision 18
        \\revision 19
        \\
    ;
    const actual_file_contents = try worktree.iterable_dir.dir.readFileAlloc(testing.allocator, "file", max_file_size);
    defer testing.allocator.free(actual_file_contents);
    try testing.expectEqualStrings(expected_file_contents, actual_file_contents);
}

/// Checks out a commit of a packfile. Intended for experimenting with and
/// benchmarking possible optimizations to the indexing and checkout behavior.
pub fn main() !void {
    const allocator = std.heap.c_allocator;

    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);
    if (args.len != 4) {
        return error.InvalidArguments; // Arguments: packfile commit worktree
    }

    var pack_file = try std.fs.cwd().openFile(args[1], .{});
    defer pack_file.close();
    const commit = try parseOid(args[2]);
    var worktree = try std.fs.cwd().makeOpenPath(args[3], .{});
    defer worktree.close();

    var git_dir = try worktree.makeOpenPath(".git", .{});
    defer git_dir.close();

    std.debug.print("Starting index...\n", .{});
    var index_file = try git_dir.createFile("idx", .{ .read = true });
    defer index_file.close();
    var index_buffered_writer = std.io.bufferedWriter(index_file.writer());
    try indexPack(allocator, pack_file, index_buffered_writer.writer());
    // Make sure the whole index is on disk before it is read back.
    try index_buffered_writer.flush();
    try index_file.sync();

    std.debug.print("Starting checkout...\n", .{});
    var repository = try Repository.init(allocator, pack_file, index_file);
    defer repository.deinit();
    try repository.checkout(worktree, commit);
}
diff --git a/src/Package/Fetch/git/testdata/testrepo.idx b/src/Package/Fetch/git/testdata/testrepo.idx Binary files differnew file mode 100644 index 0000000000..fdaba5ed48 --- /dev/null +++ b/src/Package/Fetch/git/testdata/testrepo.idx diff --git a/src/Package/Fetch/git/testdata/testrepo.pack b/src/Package/Fetch/git/testdata/testrepo.pack Binary files differnew file mode 100644 index 0000000000..1b2f5dd650 --- /dev/null +++ b/src/Package/Fetch/git/testdata/testrepo.pack diff --git a/src/Package/Manifest.zig b/src/Package/Manifest.zig new file mode 100644 index 0000000000..c1b1cdfb4f --- /dev/null +++ b/src/Package/Manifest.zig @@ -0,0 +1,566 @@
pub const max_bytes = 10 * 1024 * 1024;
pub const basename = "build.zig.zon";
pub const Hash = std.crypto.hash.sha2.Sha256;
pub const Digest = [Hash.digest_length]u8;
/// One byte for the hash function code, one for the digest length, then the
/// digest itself.
pub const multihash_len = 1 + 1 + Hash.digest_length;
pub const multihash_hex_digest_len = 2 * multihash_len;
pub const MultiHashHexDigest = [multihash_hex_digest_len]u8;

/// One entry of the manifest's `dependencies` table.
pub const Dependency = struct {
    location: Location,
    location_tok: Ast.TokenIndex,
    hash: ?[]const u8,
    hash_tok: Ast.TokenIndex,

    pub const Location = union(enum) {
        url: []const u8,
        path: []const u8,
    };
};

/// A diagnostic produced while parsing: the message, the token it refers to
/// and a byte offset relative to that token.
pub const ErrorMessage = struct {
    msg: []const u8,
    tok: Ast.TokenIndex,
    off: u32,
};

pub const MultihashFunction = enum(u16) {
    identity = 0x00,
    sha1 = 0x11,
    @"sha2-256" = 0x12,
    @"sha2-512" = 0x13,
    @"sha3-512" = 0x14,
    @"sha3-384" = 0x15,
    @"sha3-256" = 0x16,
    @"sha3-224" = 0x17,
    @"sha2-384" = 0x20,
    @"sha2-256-trunc254-padded" = 0x1012,
    @"sha2-224" = 0x1013,
    @"sha2-512-224" = 0x1014,
    @"sha2-512-256" = 0x1015,
    @"blake2b-256" = 0xb220,
    _,
};

pub const multihash_function: MultihashFunction = switch (Hash) {
    std.crypto.hash.sha2.Sha256 => .@"sha2-256",
    else => @compileError("unreachable"),
};
comptime {
    // We avoid unnecessary uleb128 code in hexDigest by asserting here the
    // values are small enough to be contained in the one-byte encoding.
    assert(@intFromEnum(multihash_function) < 127);
    assert(Hash.digest_length < 127);
}

name: []const u8,
version: std.SemanticVersion,
dependencies: std.StringArrayHashMapUnmanaged(Dependency),
paths: std.StringArrayHashMapUnmanaged(void),

errors: []ErrorMessage,
arena_state: std.heap.ArenaAllocator.State,

pub const ParseOptions = struct {
    allow_missing_paths_field: bool = false,
};

pub const Error = Allocator.Error;

/// Builds a `Manifest` from an already-parsed ZON `ast`.
///
/// Problems in the manifest contents are not returned as Zig errors; they are
/// collected in the `errors` field of the result. When `errors` is non-empty,
/// `name` and `version` may be undefined. The only error returned is
/// allocation failure. Release the result with `deinit`, passing the same
/// `gpa`.
pub fn parse(gpa: Allocator, ast: std.zig.Ast, options: ParseOptions) Error!Manifest {
    const node_tags = ast.nodes.items(.tag);
    const node_datas = ast.nodes.items(.data);
    assert(node_tags[0] == .root);
    const main_node_index = node_datas[0].lhs;

    var arena_instance = std.heap.ArenaAllocator.init(gpa);
    errdefer arena_instance.deinit();

    var p: Parse = .{
        .gpa = gpa,
        .ast = ast,
        .arena = arena_instance.allocator(),
        .errors = .{},

        .name = undefined,
        .version = undefined,
        .dependencies = .{},
        .paths = .{},
        .allow_missing_paths_field = options.allow_missing_paths_field,
        .buf = .{},
    };
    // These containers live in `gpa` while parsing; their contents are cloned
    // into the arena below, so they are always released here.
    defer p.buf.deinit(gpa);
    defer p.errors.deinit(gpa);
    defer p.dependencies.deinit(gpa);
    defer p.paths.deinit(gpa);

    p.parseRoot(main_node_index) catch |err| switch (err) {
        error.ParseFailure => assert(p.errors.items.len > 0),
        else => |e| return e,
    };

    return .{
        .name = p.name,
        .version = p.version,
        .dependencies = try p.dependencies.clone(p.arena),
        .paths = try p.paths.clone(p.arena),
        .errors = try p.arena.dupe(ErrorMessage, p.errors.items),
        .arena_state = arena_instance.state,
    };
}

/// Frees everything owned by the manifest. `gpa` must be the allocator that
/// was passed to `parse`.
pub fn deinit(man: *Manifest, gpa: Allocator) void {
    man.arena_state.promote(gpa).deinit();
    man.* = undefined;
}

const hex_charset = "0123456789abcdef";

/// Renders `x` as 16 lowercase hex digits, least significant byte first.
pub fn hex64(x: u64) [16]u8 {
    var result: [16]u8 = undefined;
    var i: usize = 0;
    while (i < 8) : (i += 1) {
        const byte = @as(u8, @truncate(x >> @as(u6, @intCast(8 * i))));
        result[i * 2 + 0] = hex_charset[byte >> 4];
        result[i * 2 + 1] = hex_charset[byte & 15];
    }
    return result;
}

test hex64 {
    const s = "[" ++ hex64(0x12345678_abcdef00) ++ "]";
    try std.testing.expectEqualStrings("[00efcdab78563412]", s);
}

/// Hex-encodes `digest` prefixed with the hash function code and the digest
/// length (one byte each, see the `comptime` assertions above).
pub fn hexDigest(digest: Digest) MultiHashHexDigest {
    var result: MultiHashHexDigest = undefined;

    result[0] = hex_charset[@intFromEnum(multihash_function) >> 4];
    result[1] = hex_charset[@intFromEnum(multihash_function) & 15];

    result[2] = hex_charset[Hash.digest_length >> 4];
    result[3] = hex_charset[Hash.digest_length & 15];

    for (digest, 0..) |byte, i| {
        result[4 + i * 2] = hex_charset[byte >> 4];
        result[5 + i * 2] = hex_charset[byte & 15];
    }
    return result;
}

/// Working state of `parse`. Strings and error messages are allocated in
/// `arena`; the growable containers use `gpa`.
const Parse = struct {
    gpa: Allocator,
    ast: std.zig.Ast,
    arena: Allocator,
    buf: std.ArrayListUnmanaged(u8),
    errors: std.ArrayListUnmanaged(ErrorMessage),

    name: []const u8,
    version: std.SemanticVersion,
    dependencies: std.StringArrayHashMapUnmanaged(Dependency),
    paths: std.StringArrayHashMapUnmanaged(void),
    allow_missing_paths_field: bool,

    const InnerError = error{ ParseFailure, OutOfMemory };

    /// Parses the top-level struct literal and records an error for every
    /// required field that is missing.
    fn parseRoot(p: *Parse, node: Ast.Node.Index) !void {
        const ast = p.ast;
        const main_tokens = ast.nodes.items(.main_token);
        const main_token = main_tokens[node];

        var buf: [2]Ast.Node.Index = undefined;
        const struct_init = ast.fullStructInit(&buf, node) orelse {
            return fail(p, main_token, "expected top level expression to be a struct", .{});
        };

        var have_name = false;
        var have_version = false;
        var have_included_paths = false;

        for (struct_init.ast.fields) |field_init| {
            // The field name sits two tokens before the initializer
            // expression (`name`, `=`, expr).
            const name_token = ast.firstToken(field_init) - 2;
            const field_name = try identifierTokenString(p, name_token);
            // We could get fancy with reflection and comptime logic here but doing
            // things manually provides an opportunity to do any additional verification
            // that is desirable on a per-field basis.
            if (mem.eql(u8, field_name, "dependencies")) {
                try parseDependencies(p, field_init);
            } else if (mem.eql(u8, field_name, "paths")) {
                have_included_paths = true;
                try parseIncludedPaths(p, field_init);
            } else if (mem.eql(u8, field_name, "name")) {
                p.name = try parseString(p, field_init);
                have_name = true;
            } else if (mem.eql(u8, field_name, "version")) {
                const version_text = try parseString(p, field_init);
                p.version = std.SemanticVersion.parse(version_text) catch |err| v: {
                    try appendError(p, main_tokens[field_init], "unable to parse semantic version: {s}", .{@errorName(err)});
                    break :v undefined;
                };
                have_version = true;
            } else {
                // Ignore unknown fields so that we can add fields in future zig
                // versions without breaking older zig versions.
            }
        }

        if (!have_name) {
            try appendError(p, main_token, "missing top-level 'name' field", .{});
        }

        if (!have_version) {
            try appendError(p, main_token, "missing top-level 'version' field", .{});
        }

        if (!have_included_paths) {
            if (p.allow_missing_paths_field) {
                try p.paths.put(p.gpa, "", {});
            } else {
                try appendError(p, main_token, "missing top-level 'paths' field", .{});
            }
        }
    }

    /// Parses the `dependencies` struct literal into `p.dependencies`, keyed
    /// by field name.
    fn parseDependencies(p: *Parse, node: Ast.Node.Index) !void {
        const ast = p.ast;
        const main_tokens = ast.nodes.items(.main_token);

        var buf: [2]Ast.Node.Index = undefined;
        const struct_init = ast.fullStructInit(&buf, node) orelse {
            const tok = main_tokens[node];
            return fail(p, tok, "expected dependencies expression to be a struct", .{});
        };

        for (struct_init.ast.fields) |field_init| {
            const name_token = ast.firstToken(field_init) - 2;
            const dep_name = try identifierTokenString(p, name_token);
            const dep = try parseDependency(p, field_init);
            try p.dependencies.put(p.gpa, dep_name, dep);
        }
    }

    /// Parses a single dependency struct literal. A field whose value fails
    /// to parse is skipped (its error has already been recorded).
    fn parseDependency(p: *Parse, node: Ast.Node.Index) !Dependency {
        const ast = p.ast;
        const main_tokens = ast.nodes.items(.main_token);

        var buf: [2]Ast.Node.Index = undefined;
        const struct_init = ast.fullStructInit(&buf, node) orelse {
            const tok = main_tokens[node];
            return fail(p, tok, "expected dependency expression to be a struct", .{});
        };

        var dep: Dependency = .{
            .location = undefined,
            .location_tok = 0,
            .hash = null,
            .hash_tok = 0,
        };
        var has_location = false;

        for (struct_init.ast.fields) |field_init| {
            const name_token = ast.firstToken(field_init) - 2;
            const field_name = try identifierTokenString(p, name_token);
            // We could get fancy with reflection and comptime logic here but doing
            // things manually provides an opportunity to do any additional verification
            // that is desirable on a per-field basis.
            if (mem.eql(u8, field_name, "url")) {
                if (has_location) {
                    return fail(p, main_tokens[field_init], "dependency should specify only one of 'url' and 'path' fields.", .{});
                }
                dep.location = .{
                    .url = parseString(p, field_init) catch |err| switch (err) {
                        error.ParseFailure => continue,
                        else => |e| return e,
                    },
                };
                has_location = true;
                dep.location_tok = main_tokens[field_init];
            } else if (mem.eql(u8, field_name, "path")) {
                if (has_location) {
                    return fail(p, main_tokens[field_init], "dependency should specify only one of 'url' and 'path' fields.", .{});
                }
                dep.location = .{
                    .path = parseString(p, field_init) catch |err| switch (err) {
                        error.ParseFailure => continue,
                        else => |e| return e,
                    },
                };
                has_location = true;
                dep.location_tok = main_tokens[field_init];
            } else if (mem.eql(u8, field_name, "hash")) {
                dep.hash = parseHash(p, field_init) catch |err| switch (err) {
                    error.ParseFailure => continue,
                    else => |e| return e,
                };
                dep.hash_tok = main_tokens[field_init];
            } else {
                // Ignore unknown fields so that we can add fields in future zig
                // versions without breaking older zig versions.
            }
        }

        if (!has_location) {
            try appendError(p, main_tokens[node], "dependency requires location field, one of 'url' or 'path'.", .{});
        }

        return dep;
    }

    /// Parses the `paths` list into `p.paths`.
    fn parseIncludedPaths(p: *Parse, node: Ast.Node.Index) !void {
        const ast = p.ast;
        const main_tokens = ast.nodes.items(.main_token);

        var buf: [2]Ast.Node.Index = undefined;
        const array_init = ast.fullArrayInit(&buf, node) orelse {
            const tok = main_tokens[node];
            // `paths` is matched as an array initializer, so report a list
            // rather than a struct.
            return fail(p, tok, "expected paths expression to be a list", .{});
        };

        for (array_init.ast.elements) |elem_node| {
            const path_string = try parseString(p, elem_node);
            // This is normalized so that it can be used in string comparisons
            // against file system paths.
            const normalized = try std.fs.path.resolve(p.arena, &.{path_string});
            try p.paths.put(p.gpa, normalized, {});
        }
    }

    /// Returns the decoded contents of a string literal node, allocated in
    /// the arena.
    fn parseString(p: *Parse, node: Ast.Node.Index) ![]const u8 {
        const ast = p.ast;
        const node_tags = ast.nodes.items(.tag);
        const main_tokens = ast.nodes.items(.main_token);
        if (node_tags[node] != .string_literal) {
            return fail(p, main_tokens[node], "expected string literal", .{});
        }
        const str_lit_token = main_tokens[node];
        const token_bytes = ast.tokenSlice(str_lit_token);
        p.buf.clearRetainingCapacity();
        try parseStrLit(p, str_lit_token, &p.buf, token_bytes, 0);
        const duped = try p.arena.dupe(u8, p.buf.items);
        return duped;
    }

    /// Parses a `hash` field: the first hex byte must name the supported
    /// multihash function and the whole string must have the expected length.
    fn parseHash(p: *Parse, node: Ast.Node.Index) ![]const u8 {
        const ast = p.ast;
        const main_tokens = ast.nodes.items(.main_token);
        const tok = main_tokens[node];
        const h = try parseString(p, node);

        if (h.len >= 2) {
            const their_multihash_func = std.fmt.parseInt(u8, h[0..2], 16) catch |err| {
                return fail(p, tok, "invalid multihash value: unable to parse hash function: {s}", .{
                    @errorName(err),
                });
            };
            if (@as(MultihashFunction, @enumFromInt(their_multihash_func)) != multihash_function) {
                return fail(p, tok, "unsupported hash function: only sha2-256 is supported", .{});
            }
        }

        if (h.len != multihash_hex_digest_len) {
            return fail(p, tok, "wrong hash size. expected: {d}, found: {d}", .{
                multihash_hex_digest_len, h.len,
            });
        }

        return h;
    }

    /// TODO: try to DRY this with AstGen.identifierTokenString
    fn identifierTokenString(p: *Parse, token: Ast.TokenIndex) InnerError![]const u8 {
        const ast = p.ast;
        const token_tags = ast.tokens.items(.tag);
        assert(token_tags[token] == .identifier);
        const ident_name = ast.tokenSlice(token);
        if (!mem.startsWith(u8, ident_name, "@")) {
            return ident_name;
        }
        // `@"..."` identifier: decode the quoted part after the `@`.
        p.buf.clearRetainingCapacity();
        try parseStrLit(p, token, &p.buf, ident_name, 1);
        const duped = try p.arena.dupe(u8, p.buf.items);
        return duped;
    }

    /// TODO: try to DRY this with AstGen.parseStrLit
    fn parseStrLit(
        p: *Parse,
        token: Ast.TokenIndex,
        buf: *std.ArrayListUnmanaged(u8),
        bytes: []const u8,
        offset: u32,
    ) InnerError!void {
        const raw_string = bytes[offset..];
        var buf_managed = buf.toManaged(p.gpa);
        const result = std.zig.string_literal.parseWrite(buf_managed.writer(), raw_string);
        // Hand the storage back before `try`, so `buf` stays valid even when
        // writing failed.
        buf.* = buf_managed.moveToUnmanaged();
        switch (try result) {
            .success => {},
            .failure => |err| try p.appendStrLitError(err, token, bytes, offset),
        }
    }

    /// TODO: try to DRY this with AstGen.failWithStrLitError
    fn appendStrLitError(
        p: *Parse,
        err: std.zig.string_literal.Error,
        token: Ast.TokenIndex,
        bytes: []const u8,
        offset: u32,
    ) Allocator.Error!void {
        const raw_string = bytes[offset..];
        switch (err) {
            .invalid_escape_character => |bad_index| {
                try p.appendErrorOff(
                    token,
                    offset + @as(u32, @intCast(bad_index)),
                    "invalid escape character: '{c}'",
                    .{raw_string[bad_index]},
                );
            },
            .expected_hex_digit => |bad_index| {
                try p.appendErrorOff(
                    token,
                    offset + @as(u32, @intCast(bad_index)),
                    "expected hex digit, found '{c}'",
                    .{raw_string[bad_index]},
                );
            },
            .empty_unicode_escape_sequence => |bad_index| {
                try p.appendErrorOff(
                    token,
                    offset + @as(u32, @intCast(bad_index)),
                    "empty unicode escape sequence",
                    .{},
                );
            },
            .expected_hex_digit_or_rbrace => |bad_index| {
                try p.appendErrorOff(
                    token,
                    offset + @as(u32, @intCast(bad_index)),
                    "expected hex digit or '}}', found '{c}'",
                    .{raw_string[bad_index]},
                );
            },
            .invalid_unicode_codepoint => |bad_index| {
                try p.appendErrorOff(
                    token,
                    offset + @as(u32, @intCast(bad_index)),
                    "unicode escape does not correspond to a valid codepoint",
                    .{},
                );
            },
            .expected_lbrace => |bad_index| {
                try p.appendErrorOff(
                    token,
                    offset + @as(u32, @intCast(bad_index)),
                    "expected '{{', found '{c}'",
                    .{raw_string[bad_index]},
                );
            },
            .expected_rbrace => |bad_index| {
                try p.appendErrorOff(
                    token,
                    offset + @as(u32, @intCast(bad_index)),
                    "expected '}}', found '{c}'",
                    .{raw_string[bad_index]},
                );
            },
            .expected_single_quote => |bad_index| {
                try p.appendErrorOff(
                    token,
                    offset + @as(u32, @intCast(bad_index)),
                    "expected single quote ('), found '{c}'",
                    .{raw_string[bad_index]},
                );
            },
            .invalid_character => |bad_index| {
                try p.appendErrorOff(
                    token,
                    offset + @as(u32, @intCast(bad_index)),
                    "invalid byte in string or character literal: '{c}'",
                    .{raw_string[bad_index]},
                );
            },
        }
    }

    /// Records an error at `tok` and returns `error.ParseFailure` for the
    /// caller to propagate.
    fn fail(
        p: *Parse,
        tok: Ast.TokenIndex,
        comptime fmt: []const u8,
        args: anytype,
    ) InnerError {
        try appendError(p, tok, fmt, args);
        return error.ParseFailure;
    }

    fn appendError(p: *Parse, tok: Ast.TokenIndex, comptime fmt: []const u8, args: anytype) !void {
        return appendErrorOff(p, tok, 0, fmt, args);
    }

    /// Records an error at `byte_offset` bytes into `tok`. The message is
    /// formatted into the arena.
    fn appendErrorOff(
        p: *Parse,
        tok: Ast.TokenIndex,
        byte_offset: u32,
        comptime fmt: []const u8,
        args: anytype,
    ) Allocator.Error!void {
        try p.errors.append(p.gpa, .{
            .msg = try std.fmt.allocPrint(p.arena, fmt, args),
            .tok = tok,
            .off = byte_offset,
        });
    }
};

const Manifest = @This();
const std = @import("std");
const mem = std.mem;
const Allocator = std.mem.Allocator;
const assert = std.debug.assert;
const Ast = std.zig.Ast;
const testing = std.testing;

test "basic" {
    const gpa = testing.allocator;

    const example =
        \\.{
        \\    .name = "foo",
        \\    .version = "3.2.1",
        \\    .dependencies = .{
        \\        .bar = .{
        \\            .url = "https://example.com/baz.tar.gz",
        \\            .hash = "1220f1b680b6065fcfc94fe777f22e73bcb7e2767e5f4d99d4255fe76ded69c7a35f",
        \\        },
        \\    },
        \\}
    ;

    var ast = try std.zig.Ast.parse(gpa, example, .zon);
    defer ast.deinit(gpa);

    try testing.expect(ast.errors.len == 0);

    // The example has no `paths` field, so allow it to be missing; otherwise
    // `parse` records a "missing top-level 'paths' field" error.
    var manifest = try Manifest.parse(gpa, ast, .{ .allow_missing_paths_field = true });
    defer manifest.deinit(gpa);

    try testing.expectEqual(@as(usize, 0), manifest.errors.len);

    try testing.expectEqualStrings("foo", manifest.name);

    try testing.expectEqual(@as(std.SemanticVersion, .{
        .major = 3,
        .minor = 2,
        .patch = 1,
    }), manifest.version);

    try testing.expect(manifest.dependencies.count() == 1);
    try testing.expectEqualStrings("bar", manifest.dependencies.keys()[0]);
    // The URL lives in the `location` union of the dependency.
    try testing.expectEqualStrings(
        "https://example.com/baz.tar.gz",
        manifest.dependencies.values()[0].location.url,
    );
    try testing.expectEqualStrings(
        "1220f1b680b6065fcfc94fe777f22e73bcb7e2767e5f4d99d4255fe76ded69c7a35f",
        manifest.dependencies.values()[0].hash orelse return error.TestFailed,
    );
}
diff --git a/src/Package/Module.zig b/src/Package/Module.zig new file mode 100644 index 0000000000..7e6b518892 --- /dev/null +++ b/src/Package/Module.zig @@ -0,0 +1,34 @@ +//! Corresponds to something that Zig source code can `@import`. +//! Not to be confused with src/Module.zig which should be renamed +//! to something else. https://github.com/ziglang/zig/issues/14307 + +/// Only files inside this directory can be imported. +root: Package.Path, +/// Relative to `root`. May contain path separators. +root_src_path: []const u8, +/// Name used in compile errors. Looks like "root.foo.bar". +fully_qualified_name: []const u8, +/// The dependency table of this module. 
Shared dependencies such as 'std', +/// 'builtin', and 'root' are not specified in every dependency table, but +/// instead only in the table of `main_mod`. `Module.importFile` is +/// responsible for detecting these names and using the correct package. +deps: Deps = .{}, + +pub const Deps = std.StringHashMapUnmanaged(*Module); + +pub const Tree = struct { + /// Each `Package` exposes a `Module` with build.zig as its root source file. + build_module_table: std.AutoArrayHashMapUnmanaged(MultiHashHexDigest, *Module), +}; + +pub fn create(allocator: Allocator, m: Module) Allocator.Error!*Module { + const new = try allocator.create(Module); + new.* = m; + return new; +} + +const Module = @This(); +const Package = @import("../Package.zig"); +const std = @import("std"); +const Allocator = std.mem.Allocator; +const MultiHashHexDigest = Package.Manifest.MultiHashHexDigest; diff --git a/src/Package/hash.zig b/src/Package/hash.zig deleted file mode 100644 index b14ec70244..0000000000 --- a/src/Package/hash.zig +++ /dev/null @@ -1,153 +0,0 @@ -const builtin = @import("builtin"); -const std = @import("std"); -const fs = std.fs; -const ThreadPool = std.Thread.Pool; -const WaitGroup = std.Thread.WaitGroup; -const Allocator = std.mem.Allocator; - -const Hash = @import("../Manifest.zig").Hash; - -pub fn compute(thread_pool: *ThreadPool, pkg_dir: fs.IterableDir) ![Hash.digest_length]u8 { - const gpa = thread_pool.allocator; - - // We'll use an arena allocator for the path name strings since they all - // need to be in memory for sorting. - var arena_instance = std.heap.ArenaAllocator.init(gpa); - defer arena_instance.deinit(); - const arena = arena_instance.allocator(); - - // TODO: delete files not included in the package prior to computing the package hash. - // for example, if the ini file has directives to include/not include certain files, - // apply those rules directly to the filesystem right here. 
This ensures that files - // not protected by the hash are not present on the file system. - - // Collect all files, recursively, then sort. - var all_files = std.ArrayList(*HashedFile).init(gpa); - defer all_files.deinit(); - - var walker = try pkg_dir.walk(gpa); - defer walker.deinit(); - - { - // The final hash will be a hash of each file hashed independently. This - // allows hashing in parallel. - var wait_group: WaitGroup = .{}; - defer wait_group.wait(); - - while (try walker.next()) |entry| { - const kind: HashedFile.Kind = switch (entry.kind) { - .directory => continue, - .file => .file, - .sym_link => .sym_link, - else => return error.IllegalFileTypeInPackage, - }; - const hashed_file = try arena.create(HashedFile); - const fs_path = try arena.dupe(u8, entry.path); - hashed_file.* = .{ - .fs_path = fs_path, - .normalized_path = try normalizePath(arena, fs_path), - .kind = kind, - .hash = undefined, // to be populated by the worker - .failure = undefined, // to be populated by the worker - }; - wait_group.start(); - try thread_pool.spawn(workerHashFile, .{ pkg_dir.dir, hashed_file, &wait_group }); - - try all_files.append(hashed_file); - } - } - - std.mem.sortUnstable(*HashedFile, all_files.items, {}, HashedFile.lessThan); - - var hasher = Hash.init(.{}); - var any_failures = false; - for (all_files.items) |hashed_file| { - hashed_file.failure catch |err| { - any_failures = true; - std.log.err("unable to hash '{s}': {s}", .{ hashed_file.fs_path, @errorName(err) }); - }; - hasher.update(&hashed_file.hash); - } - if (any_failures) return error.PackageHashUnavailable; - return hasher.finalResult(); -} - -const HashedFile = struct { - fs_path: []const u8, - normalized_path: []const u8, - hash: [Hash.digest_length]u8, - failure: Error!void, - kind: Kind, - - const Error = - fs.File.OpenError || - fs.File.ReadError || - fs.File.StatError || - fs.Dir.ReadLinkError; - - const Kind = enum { file, sym_link }; - - fn lessThan(context: void, lhs: *const HashedFile, 
rhs: *const HashedFile) bool { - _ = context; - return std.mem.lessThan(u8, lhs.normalized_path, rhs.normalized_path); - } -}; - -/// Make a file system path identical independently of operating system path inconsistencies. -/// This converts backslashes into forward slashes. -fn normalizePath(arena: Allocator, fs_path: []const u8) ![]const u8 { - const canonical_sep = '/'; - - if (fs.path.sep == canonical_sep) - return fs_path; - - const normalized = try arena.dupe(u8, fs_path); - for (normalized) |*byte| { - switch (byte.*) { - fs.path.sep => byte.* = canonical_sep, - else => continue, - } - } - return normalized; -} - -fn workerHashFile(dir: fs.Dir, hashed_file: *HashedFile, wg: *WaitGroup) void { - defer wg.finish(); - hashed_file.failure = hashFileFallible(dir, hashed_file); -} - -fn hashFileFallible(dir: fs.Dir, hashed_file: *HashedFile) HashedFile.Error!void { - var buf: [8000]u8 = undefined; - var hasher = Hash.init(.{}); - hasher.update(hashed_file.normalized_path); - switch (hashed_file.kind) { - .file => { - var file = try dir.openFile(hashed_file.fs_path, .{}); - defer file.close(); - hasher.update(&.{ 0, @intFromBool(try isExecutable(file)) }); - while (true) { - const bytes_read = try file.read(&buf); - if (bytes_read == 0) break; - hasher.update(buf[0..bytes_read]); - } - }, - .sym_link => { - const link_name = try dir.readLink(hashed_file.fs_path, &buf); - hasher.update(link_name); - }, - } - hasher.final(&hashed_file.hash); -} - -fn isExecutable(file: fs.File) !bool { - if (builtin.os.tag == .windows) { - // TODO check the ACL on Windows. - // Until this is implemented, this could be a false negative on - // Windows, which is why we do not yet set executable_bit_only above - // when unpacking the tarball. - return false; - } else { - const stat = try file.stat(); - return (stat.mode & std.os.S.IXUSR) != 0; - } -} |
