Merge pull request #19388 from ziglang/cache-dedup

cache system file deduplication
author: Andrew Kelley <andrew@ziglang.org> 2024-03-22 01:13:43 -0700
committer: GitHub <noreply@github.com> 2024-03-22 01:13:43 -0700
commit: a2651cbc829d44df4c3773037598b30e8cf0c4da (patch)
tree: 555c74b10683ae9678c68777310116f47142a8aa /lib/std/Build/Cache.zig
parent: 54c08579e4859673391843182aa2fd44aabbf6cf (diff)
parent: 950359071bca707dbc9763f1bf3ebc79cd52ebca (diff)
download: zig-a2651cbc829d44df4c3773037598b30e8cf0c4da.tar.gz
zig-a2651cbc829d44df4c3773037598b30e8cf0c4da.zip
1 files changed, 151 insertions, 140 deletions
diff --git a/lib/std/Build/Cache.zig b/lib/std/Build/Cache.zig
index c18a748de2..c4145a9f3f 100644
--- a/lib/std/Build/Cache.zig
+++ b/lib/std/Build/Cache.zig
@@ -2,77 +2,6 @@
 //! This is not a general-purpose cache. It is designed to be fast and simple,
 //! not to withstand attacks using specially-crafted input.
 
-pub const Directory = struct {
-    /// This field is redundant for operations that can act on the open directory handle
-    /// directly, but it is needed when passing the directory to a child process.
-    /// `null` means cwd.
-    path: ?[]const u8,
-    handle: fs.Dir,
-
-    pub fn clone(d: Directory, arena: Allocator) Allocator.Error!Directory {
-        return .{
-            .path = if (d.path) |p| try arena.dupe(u8, p) else null,
-            .handle = d.handle,
-        };
-    }
-
-    pub fn cwd() Directory {
-        return .{
-            .path = null,
-            .handle = fs.cwd(),
-        };
-    }
-
-    pub fn join(self: Directory, allocator: Allocator, paths: []const []const u8) ![]u8 {
-        if (self.path) |p| {
-            // TODO clean way to do this with only 1 allocation
-            const part2 = try fs.path.join(allocator, paths);
-            defer allocator.free(part2);
-            return fs.path.join(allocator, &[_][]const u8{ p, part2 });
-        } else {
-            return fs.path.join(allocator, paths);
-        }
-    }
-
-    pub fn joinZ(self: Directory, allocator: Allocator, paths: []const []const u8) ![:0]u8 {
-        if (self.path) |p| {
-            // TODO clean way to do this with only 1 allocation
-            const part2 = try fs.path.join(allocator, paths);
-            defer allocator.free(part2);
-            return fs.path.joinZ(allocator, &[_][]const u8{ p, part2 });
-        } else {
-            return fs.path.joinZ(allocator, paths);
-        }
-    }
-
-    /// Whether or not the handle should be closed, or the path should be freed
-    /// is determined by usage, however this function is provided for convenience
-    /// if it happens to be what the caller needs.
-    pub fn closeAndFree(self: *Directory, gpa: Allocator) void {
-        self.handle.close();
-        if (self.path) |p| gpa.free(p);
-        self.* = undefined;
-    }
-
-    pub fn format(
-        self: Directory,
-        comptime fmt_string: []const u8,
-        options: fmt.FormatOptions,
-        writer: anytype,
-    ) !void {
-        _ = options;
-        if (fmt_string.len != 0) fmt.invalidFmtError(fmt_string, self);
-        if (self.path) |p| {
-            try writer.writeAll(p);
-            try writer.writeAll(fs.path.sep_str);
-        }
-    }
-
-    pub fn eql(self: Directory, other: Directory) bool {
-        return self.handle.fd == other.handle.fd;
-    }
-};
-
 gpa: Allocator,
 manifest_dir: fs.Dir,
 hash: HashHelper = .{},
@@ -88,6 +17,8 @@ mutex: std.Thread.Mutex = .{},
 prefixes_buffer: [4]Directory = undefined,
 prefixes_len: usize = 0,
 
+pub const Path = @import("Cache/Path.zig");
+pub const Directory = @import("Cache/Directory.zig");
 pub const DepTokenizer = @import("Cache/DepTokenizer.zig");
 
 const Cache = @This();
@@ -124,7 +55,15 @@ pub fn prefixes(cache: *const Cache) []const Directory {
 
 const PrefixedPath = struct {
     prefix: u8,
-    sub_path: []u8,
+    sub_path: []const u8,
+
+    fn eql(a: PrefixedPath, b: PrefixedPath) bool {
+        return a.prefix == b.prefix and std.mem.eql(u8, a.sub_path, b.sub_path);
+    }
+
+    fn hash(pp: PrefixedPath) u32 {
+        return @truncate(std.hash.Wyhash.hash(pp.prefix, pp.sub_path));
+    }
 };
 
 fn findPrefix(cache: *const Cache, file_path: []const u8) !PrefixedPath {
@@ -183,7 +122,7 @@ pub const HexDigest = [hex_digest_len]u8;
 
 /// This is currently just an arbitrary non-empty string that can't match another manifest line.
 const manifest_header = "0";
-const manifest_file_size_max = 50 * 1024 * 1024;
+const manifest_file_size_max = 100 * 1024 * 1024;
 
 /// The type used for hashing file contents. Currently, this is SipHash128(1, 3), because it
 /// provides enough collision resistance for the Manifest use cases, while being one of our
@@ -201,7 +140,7 @@ pub const hasher_init: Hasher = Hasher.init(&[_]u8{
 });
 
 pub const File = struct {
-    prefixed_path: ?PrefixedPath,
+    prefixed_path: PrefixedPath,
     max_file_size: ?usize,
     stat: Stat,
     bin_digest: BinDigest,
@@ -214,16 +153,18 @@ pub const File = struct {
     };
 
     pub fn deinit(self: *File, gpa: Allocator) void {
-        if (self.prefixed_path) |pp| {
-            gpa.free(pp.sub_path);
-            self.prefixed_path = null;
-        }
+        gpa.free(self.prefixed_path.sub_path);
         if (self.contents) |contents| {
             gpa.free(contents);
             self.contents = null;
         }
         self.* = undefined;
     }
+
+    pub fn updateMaxSize(file: *File, new_max_size: ?usize) void {
+        const new = new_max_size orelse return;
+        file.max_file_size = if (file.max_file_size) |old| @max(old, new) else new;
+    }
 };
 
 pub const HashHelper = struct {
@@ -365,7 +306,7 @@ pub const Manifest = struct {
     // order to obtain a problematic timestamp for the next call. Calls after that
     // will then use the same timestamp, to avoid unnecessary filesystem writes.
     want_refresh_timestamp: bool = true,
-    files: std.ArrayListUnmanaged(File) = .{},
+    files: Files = .{},
     hex_digest: HexDigest,
     /// Populated when hit() returns an error because of one
     /// of the files listed in the manifest.
@@ -374,6 +315,34 @@ pub const Manifest = struct {
     /// what time the file system thinks it is, according to its own granularity.
     recent_problematic_timestamp: i128 = 0,
 
+    pub const Files = std.ArrayHashMapUnmanaged(File, void, FilesContext, false);
+
+    pub const FilesContext = struct {
+        pub fn hash(fc: FilesContext, file: File) u32 {
+            _ = fc;
+            return file.prefixed_path.hash();
+        }
+
+        pub fn eql(fc: FilesContext, a: File, b: File, b_index: usize) bool {
+            _ = fc;
+            _ = b_index;
+            return a.prefixed_path.eql(b.prefixed_path);
+        }
+    };
+
+    const FilesAdapter = struct {
+        pub fn eql(context: @This(), a: PrefixedPath, b: File, b_index: usize) bool {
+            _ = context;
+            _ = b_index;
+            return a.eql(b.prefixed_path);
+        }
+
+        pub fn hash(context: @This(), key: PrefixedPath) u32 {
+            _ = context;
+            return key.hash();
+        }
+    };
+
     /// Add a file as a dependency of process being cached. When `hit` is
     /// called, the file's contents will be checked to ensure that it matches
     /// the contents from previous times.
@@ -386,7 +355,7 @@ pub const Manifest = struct {
     /// to access the contents of the file after calling `hit()` like so:
     ///
     /// ```
-    /// var file_contents = cache_hash.files.items[file_index].contents.?;
+    /// var file_contents = cache_hash.files.keys()[file_index].contents.?;
     /// ```
     pub fn addFile(self: *Manifest, file_path: []const u8, max_file_size: ?usize) !usize {
         assert(self.manifest_file == null);
@@ -396,7 +365,12 @@ pub const Manifest = struct {
         const prefixed_path = try self.cache.findPrefix(file_path);
         errdefer gpa.free(prefixed_path.sub_path);
 
-        self.files.addOneAssumeCapacity().* = .{
+        const gop = self.files.getOrPutAssumeCapacityAdapted(prefixed_path, FilesAdapter{});
+        if (gop.found_existing) {
+            gop.key_ptr.updateMaxSize(max_file_size);
+            return gop.index;
+        }
+        gop.key_ptr.* = .{
             .prefixed_path = prefixed_path,
             .contents = null,
             .max_file_size = max_file_size,
@@ -407,7 +381,7 @@ pub const Manifest = struct {
         self.hash.add(prefixed_path.prefix);
         self.hash.addBytes(prefixed_path.sub_path);
 
-        return self.files.items.len - 1;
+        return gop.index;
     }
 
     pub fn addOptionalFile(self: *Manifest, optional_file_path: ?[]const u8) !void {
@@ -487,7 +461,7 @@ pub const Manifest = struct {
 
         self.want_refresh_timestamp = true;
 
-        const input_file_count = self.files.items.len;
+        const input_file_count = self.files.entries.len;
         while (true) : (self.unhit(bin_digest, input_file_count)) {
             const file_contents = try self.manifest_file.?.reader().readAllAlloc(gpa, manifest_file_size_max);
             defer gpa.free(file_contents);
@@ -499,7 +473,7 @@ pub const Manifest = struct {
                 if (try self.upgradeToExclusiveLock()) continue;
                 self.manifest_dirty = true;
                 while (idx < input_file_count) : (idx += 1) {
-                    const ch_file = &self.files.items[idx];
+                    const ch_file = &self.files.keys()[idx];
                     self.populateFileHash(ch_file) catch |err| {
                         self.failed_file_index = idx;
                         return err;
@@ -510,18 +484,6 @@ pub const Manifest = struct {
             while (line_iter.next()) |line| {
                 defer idx += 1;
 
-                const cache_hash_file = if (idx < input_file_count) &self.files.items[idx] else blk: {
-                    const new = try self.files.addOne(gpa);
-                    new.* = .{
-                        .prefixed_path = null,
-                        .contents = null,
-                        .max_file_size = null,
-                        .stat = undefined,
-                        .bin_digest = undefined,
-                    };
-                    break :blk new;
-                };
-
                 var iter = mem.tokenizeScalar(u8, line, ' ');
                 const size = iter.next() orelse return error.InvalidFormat;
                 const inode = iter.next() orelse return error.InvalidFormat;
@@ -530,30 +492,61 @@ pub const Manifest = struct {
                 const prefix_str = iter.next() orelse return error.InvalidFormat;
                 const file_path = iter.rest();
 
-                cache_hash_file.stat.size = fmt.parseInt(u64, size, 10) catch return error.InvalidFormat;
-                cache_hash_file.stat.inode = fmt.parseInt(fs.File.INode, inode, 10) catch return error.InvalidFormat;
-                cache_hash_file.stat.mtime = fmt.parseInt(i64, mtime_nsec_str, 10) catch return error.InvalidFormat;
-                _ = fmt.hexToBytes(&cache_hash_file.bin_digest, digest_str) catch return error.InvalidFormat;
+                const stat_size = fmt.parseInt(u64, size, 10) catch return error.InvalidFormat;
+                const stat_inode = fmt.parseInt(fs.File.INode, inode, 10) catch return error.InvalidFormat;
+                const stat_mtime = fmt.parseInt(i64, mtime_nsec_str, 10) catch return error.InvalidFormat;
+                const file_bin_digest = b: {
+                    if (digest_str.len != hex_digest_len) return error.InvalidFormat;
+                    var bd: BinDigest = undefined;
+                    _ = fmt.hexToBytes(&bd, digest_str) catch return error.InvalidFormat;
+                    break :b bd;
+                };
+
                 const prefix = fmt.parseInt(u8, prefix_str, 10) catch return error.InvalidFormat;
                 if (prefix >= self.cache.prefixes_len) return error.InvalidFormat;
 
-                if (file_path.len == 0) {
-                    return error.InvalidFormat;
-                }
-                if (cache_hash_file.prefixed_path) |pp| {
-                    if (pp.prefix != prefix or !mem.eql(u8, file_path, pp.sub_path)) {
-                        return error.InvalidFormat;
-                    }
-                }
+                if (file_path.len == 0) return error.InvalidFormat;
 
-                if (cache_hash_file.prefixed_path == null) {
-                    cache_hash_file.prefixed_path = .{
+                const cache_hash_file = f: {
+                    const prefixed_path: PrefixedPath = .{
                         .prefix = prefix,
-                        .sub_path = try gpa.dupe(u8, file_path),
+                        .sub_path = file_path, // expires with file_contents
                     };
-                }
+                    if (idx < input_file_count) {
+                        const file = &self.files.keys()[idx];
+                        if (!file.prefixed_path.eql(prefixed_path))
+                            return error.InvalidFormat;
+
+                        file.stat = .{
+                            .size = stat_size,
+                            .inode = stat_inode,
+                            .mtime = stat_mtime,
+                        };
+                        file.bin_digest = file_bin_digest;
+                        break :f file;
+                    }
+                    const gop = try self.files.getOrPutAdapted(gpa, prefixed_path, FilesAdapter{});
+                    errdefer assert(self.files.popOrNull() != null);
+                    if (!gop.found_existing) {
+                        gop.key_ptr.* = .{
+                            .prefixed_path = .{
+                                .prefix = prefix,
+                                .sub_path = try gpa.dupe(u8, file_path),
+                            },
+                            .contents = null,
+                            .max_file_size = null,
+                            .stat = .{
+                                .size = stat_size,
+                                .inode = stat_inode,
+                                .mtime = stat_mtime,
+                            },
+                            .bin_digest = file_bin_digest,
+                        };
+                    }
+                    break :f gop.key_ptr;
+                };
 
-                const pp = cache_hash_file.prefixed_path.?;
+                const pp = cache_hash_file.prefixed_path;
                 const dir = self.cache.prefixes()[pp.prefix].handle;
                 const this_file = dir.openFile(pp.sub_path, .{ .mode = .read_only }) catch |err| switch (err) {
                     error.FileNotFound => {
@@ -617,7 +610,7 @@ pub const Manifest = struct {
                 if (try self.upgradeToExclusiveLock()) continue;
                 self.manifest_dirty = true;
                 while (idx < input_file_count) : (idx += 1) {
-                    const ch_file = &self.files.items[idx];
+                    const ch_file = &self.files.keys()[idx];
                     self.populateFileHash(ch_file) catch |err| {
                         self.failed_file_index = idx;
                         return err;
@@ -640,12 +633,12 @@ pub const Manifest = struct {
         self.hash.hasher.update(&bin_digest);
 
         // Remove files not in the initial hash.
-        for (self.files.items[input_file_count..]) |*file| {
+        for (self.files.keys()[input_file_count..]) |*file| {
             file.deinit(self.cache.gpa);
         }
         self.files.shrinkRetainingCapacity(input_file_count);
 
-        for (self.files.items) |file| {
+        for (self.files.keys()) |file| {
             self.hash.hasher.update(&file.bin_digest);
         }
     }
@@ -685,7 +678,7 @@ pub const Manifest = struct {
     }
 
     fn populateFileHash(self: *Manifest, ch_file: *File) !void {
-        const pp = ch_file.prefixed_path.?;
+        const pp = ch_file.prefixed_path;
         const dir = self.cache.prefixes()[pp.prefix].handle;
         const file = try dir.openFile(pp.sub_path, .{});
         defer file.close();
@@ -751,7 +744,7 @@ pub const Manifest = struct {
             .bin_digest = undefined,
             .contents = null,
         };
-        errdefer self.files.shrinkRetainingCapacity(self.files.items.len - 1);
+        errdefer self.files.shrinkRetainingCapacity(self.files.entries.len - 1);
 
         try self.populateFileHash(new_ch_file);
 
@@ -759,9 +752,11 @@ pub const Manifest = struct {
     }
 
     /// Add a file as a dependency of process being cached, after the initial hash has been
-    /// calculated. This is useful for processes that don't know the all the files that
-    /// are depended on ahead of time. For example, a source file that can import other files
-    /// will need to be recompiled if the imported file is changed.
+    /// calculated.
+    ///
+    /// This is useful for processes that don't know all the files that are
+    /// depended on ahead of time. For example, a source file that can import
+    /// other files will need to be recompiled if the imported file is changed.
     pub fn addFilePost(self: *Manifest, file_path: []const u8) !void {
         assert(self.manifest_file != null);
 
@@ -769,17 +764,26 @@ pub const Manifest = struct {
         const prefixed_path = try self.cache.findPrefix(file_path);
         errdefer gpa.free(prefixed_path.sub_path);
 
-        const new_ch_file = try self.files.addOne(gpa);
-        new_ch_file.* = .{
+        const gop = try self.files.getOrPutAdapted(gpa, prefixed_path, FilesAdapter{});
+        errdefer assert(self.files.popOrNull() != null);
+
+        if (gop.found_existing) {
+            gpa.free(prefixed_path.sub_path);
+            return;
+        }
+
+        gop.key_ptr.* = .{
             .prefixed_path = prefixed_path,
             .max_file_size = null,
             .stat = undefined,
             .bin_digest = undefined,
             .contents = null,
         };
-        errdefer self.files.shrinkRetainingCapacity(self.files.items.len - 1);
 
-        try self.populateFileHash(new_ch_file);
+        self.files.lockPointers();
+        defer self.files.unlockPointers();
+
+        try self.populateFileHash(gop.key_ptr);
     }
 
     /// Like `addFilePost` but when the file contents have already been loaded from disk.
@@ -793,13 +797,20 @@ pub const Manifest = struct {
         assert(self.manifest_file != null);
         const gpa = self.cache.gpa;
 
-        const ch_file = try self.files.addOne(gpa);
-        errdefer self.files.shrinkRetainingCapacity(self.files.items.len - 1);
-
         const prefixed_path = try self.cache.findPrefixResolved(resolved_path);
         errdefer gpa.free(prefixed_path.sub_path);
 
-        ch_file.* = .{
+        const gop = try self.files.getOrPutAdapted(gpa, prefixed_path, FilesAdapter{});
+        errdefer assert(self.files.popOrNull() != null);
+
+        if (gop.found_existing) {
+            gpa.free(prefixed_path.sub_path);
+            return;
+        }
+
+        const new_file = gop.key_ptr;
+
+        new_file.* = .{
             .prefixed_path = prefixed_path,
             .max_file_size = null,
             .stat = stat,
@@ -807,19 +818,19 @@ pub const Manifest = struct {
             .contents = null,
         };
 
-        if (self.isProblematicTimestamp(ch_file.stat.mtime)) {
+        if (self.isProblematicTimestamp(new_file.stat.mtime)) {
             // The actual file has an unreliable timestamp, force it to be hashed
-            ch_file.stat.mtime = 0;
-            ch_file.stat.inode = 0;
+            new_file.stat.mtime = 0;
+            new_file.stat.inode = 0;
         }
 
         {
             var hasher = hasher_init;
             hasher.update(bytes);
-            hasher.final(&ch_file.bin_digest);
+            hasher.final(&new_file.bin_digest);
         }
 
-        self.hash.hasher.update(&ch_file.bin_digest);
+        self.hash.hasher.update(&new_file.bin_digest);
     }
 
     pub fn addDepFilePost(self: *Manifest, dir: fs.Dir, dep_file_basename: []const u8) !void {
@@ -885,14 +896,14 @@ pub const Manifest = struct {
 
             const writer = contents.writer();
             try writer.writeAll(manifest_header ++ "\n");
-            for (self.files.items) |file| {
+            for (self.files.keys()) |file| {
                 try writer.print("{d} {d} {d} {} {d} {s}\n", .{
                     file.stat.size,
                     file.stat.inode,
                     file.stat.mtime,
                     fmt.fmtSliceHexLower(&file.bin_digest),
-                    file.prefixed_path.?.prefix,
-                    file.prefixed_path.?.sub_path,
+                    file.prefixed_path.prefix,
+                    file.prefixed_path.sub_path,
                 });
             }
 
@@ -961,7 +972,7 @@ pub const Manifest = struct {
 
             file.close();
         }
-        for (self.files.items) |*file| {
+        for (self.files.keys()) |*file| {
             file.deinit(self.cache.gpa);
         }
         self.files.deinit(self.cache.gpa);
@@ -1130,7 +1141,7 @@ test "check that changing a file makes cache fail" {
             // There should be nothing in the cache
             try testing.expectEqual(false, try ch.hit());
 
-            try testing.expect(mem.eql(u8, original_temp_file_contents, ch.files.items[temp_file_idx].contents.?));
+            try testing.expect(mem.eql(u8, original_temp_file_contents, ch.files.keys()[temp_file_idx].contents.?));
 
             digest1 = ch.final();
 
@@ -1150,7 +1161,7 @@ test "check that changing a file makes cache fail" {
             try testing.expectEqual(false, try ch.hit());
 
             // The cache system does not keep the contents of re-hashed input files.
-            try testing.expect(ch.files.items[temp_file_idx].contents == null);
+            try testing.expect(ch.files.keys()[temp_file_idx].contents == null);
 
             digest2 = ch.final();
author	Andrew Kelley <andrew@ziglang.org>	2024-03-22 01:13:43 -0700
committer	GitHub <noreply@github.com>	2024-03-22 01:13:43 -0700
commit	a2651cbc829d44df4c3773037598b30e8cf0c4da (patch)
tree	555c74b10683ae9678c68777310116f47142a8aa /lib/std/Build/Cache.zig
parent	54c08579e4859673391843182aa2fd44aabbf6cf (diff)
parent	950359071bca707dbc9763f1bf3ebc79cd52ebca (diff)
download	zig-a2651cbc829d44df4c3773037598b30e8cf0c4da.tar.gz zig-a2651cbc829d44df4c3773037598b30e8cf0c4da.zip