about summary refs log tree commit diff
path: root/lib/std/hash/benchmark.zig
diff options
context:
space:
mode:
author: Dominic <4678790+dweiller@users.noreply.github.com> 2023-07-25 03:47:45 +1000
committer: GitHub <noreply@github.com> 2023-07-24 13:47:45 -0400
commit: 559150e8440d288adfaeb84c8bc3ec400605287d (patch)
tree: cd786a3360a780067c1f12d9ec84beca49f087b7 /lib/std/hash/benchmark.zig
parent: d82b35901035a325ca7afd38b28ff2386f90ae84 (diff)
download: zig-559150e8440d288adfaeb84c8bc3ec400605287d.tar.gz
download: zig-559150e8440d288adfaeb84c8bc3ec400605287d.zip
Xxhash perf (#15947)
Improvements for xxHash performance, both on small keys as well as large slices.

* std.hash: make benchmarks consistent for xxhash

  There is some odd behaviour in the timings for the XXHash benchmarks
  introduced in 879f0b9, specifically the changes to the allocation in
  benchmarkHash. The problem is somewhere in the difference between
  9628243 and 9362d61 (these are commits that were force-pushed over but
  can be found in PR #15917).

* std.hash: correctly calculate throughput in benchmark
* std.hash: add hashes per sec to small key output
* std.hash: add exact and small xxhash routines
* std.hash: add --small-only flag to benchmark
* std.hash.xxhash: extract stateless Accumulator type
* std.hash.xxhash: cleanup hash() and improve small key perf
* std.hash.xxhash: port xxhash64 changes to xxhash32
* std.hash: change benchmark --small-only flag to --include-array
Diffstat (limited to 'lib/std/hash/benchmark.zig')
-rw-r--r--  lib/std/hash/benchmark.zig  243
1 file changed, 224 insertions, 19 deletions
diff --git a/lib/std/hash/benchmark.zig b/lib/std/hash/benchmark.zig
index 699de5ceb4..322adeb61c 100644
--- a/lib/std/hash/benchmark.zig
+++ b/lib/std/hash/benchmark.zig
@@ -18,6 +18,7 @@ const Hash = struct {
name: []const u8,
has_iterative_api: bool = true,
has_crypto_api: bool = false,
+ has_anytype_api: ?[]const comptime_int = null,
init_u8s: ?[]const u8 = null,
init_u64: ?u64 = null,
};
@@ -27,11 +28,13 @@ const hashes = [_]Hash{
.ty = hash.XxHash64,
.name = "xxhash64",
.init_u64 = 0,
+ .has_anytype_api = @as([]const comptime_int, &[_]comptime_int{ 8, 16, 32, 48, 64, 80, 96, 112, 128 }),
},
Hash{
.ty = hash.XxHash32,
.name = "xxhash32",
.init_u64 = 0,
+ .has_anytype_api = @as([]const comptime_int, &[_]comptime_int{ 8, 16, 32, 48, 64, 80, 96, 112, 128 }),
},
Hash{
.ty = hash.Wyhash,
@@ -99,14 +102,14 @@ const Result = struct {
};
const block_size: usize = 8 * 8192;
-const alignment: usize = 64;
pub fn benchmarkHash(comptime H: anytype, bytes: usize, allocator: std.mem.Allocator) !Result {
- const blocks_count = bytes / block_size;
- var blocks = try allocator.alloc(u8, block_size + alignment * (blocks_count - 1));
+ var blocks = try allocator.alloc(u8, bytes);
defer allocator.free(blocks);
random.bytes(blocks);
+ const block_count = bytes / block_size;
+
var h = blk: {
if (H.init_u8s) |init| {
break :blk H.ty.init(init[0..H.ty.key_length]);
@@ -118,17 +121,17 @@ pub fn benchmarkHash(comptime H: anytype, bytes: usize, allocator: std.mem.Alloc
};
var timer = try Timer.start();
- const start = timer.lap();
- for (0..blocks_count) |i| {
- h.update(blocks[i * alignment ..][0..block_size]);
+ for (0..block_count) |i| {
+ h.update(blocks[i * block_size ..][0..block_size]);
}
const final = if (H.has_crypto_api) @as(u64, @truncate(h.finalInt())) else h.final();
std.mem.doNotOptimizeAway(final);
- const end = timer.read();
+ const elapsed_ns = timer.read();
- const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
- const throughput = @as(u64, @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s));
+ const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+ const size_float: f64 = @floatFromInt(block_size * block_count);
+ const throughput: u64 = @intFromFloat(size_float / elapsed_s);
return Result{
.hash = final,
@@ -144,7 +147,6 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
const key_count = bytes / key_size;
var timer = try Timer.start();
- const start = timer.lap();
var sum: u64 = 0;
for (0..key_count) |i| {
@@ -164,10 +166,59 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
};
sum +%= final;
}
- const end = timer.read();
+ const elapsed_ns = timer.read();
+
+ const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+ const size_float: f64 = @floatFromInt(key_count * key_size);
+ const throughput: u64 = @intFromFloat(size_float / elapsed_s);
+
+ std.mem.doNotOptimizeAway(sum);
+
+ return Result{
+ .hash = sum,
+ .throughput = throughput,
+ };
+}
+
+// the array and array pointer benchmarks for xxhash are very sensitive to in-lining,
+// if you see strange performance changes consider using `.never_inline` or `.always_inline`
+// to ensure the changes are not only due to the optimiser inlining the benchmark differently
+pub fn benchmarkHashSmallKeysArrayPtr(
+ comptime H: anytype,
+ comptime key_size: usize,
+ bytes: usize,
+ allocator: std.mem.Allocator,
+) !Result {
+ var blocks = try allocator.alloc(u8, bytes);
+ defer allocator.free(blocks);
+ random.bytes(blocks);
+
+ const key_count = bytes / key_size;
+
+ var timer = try Timer.start();
+
+ var sum: u64 = 0;
+ for (0..key_count) |i| {
+ const small_key = blocks[i * key_size ..][0..key_size];
+ const final: u64 = blk: {
+ if (H.init_u8s) |init| {
+ if (H.has_crypto_api) {
+ break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
+ } else {
+ break :blk H.ty.hash(init, small_key);
+ }
+ }
+ if (H.init_u64) |init| {
+ break :blk H.ty.hash(init, small_key);
+ }
+ break :blk H.ty.hash(small_key);
+ };
+ sum +%= final;
+ }
+ const elapsed_ns = timer.read();
- const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
- const throughput = @as(u64, @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s));
+ const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+ const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);
std.mem.doNotOptimizeAway(sum);
@@ -177,6 +228,95 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
};
}
+// the array and array pointer benchmarks for xxhash are very sensitive to in-lining,
+// if you see strange performance changes consider using `.never_inline` or `.always_inline`
+// to ensure the changes are not only due to the optimiser inlining the benchmark differently
+pub fn benchmarkHashSmallKeysArray(
+ comptime H: anytype,
+ comptime key_size: usize,
+ bytes: usize,
+ allocator: std.mem.Allocator,
+) !Result {
+ var blocks = try allocator.alloc(u8, bytes);
+ defer allocator.free(blocks);
+ random.bytes(blocks);
+
+ const key_count = bytes / key_size;
+
+ var i: usize = 0;
+ var timer = try Timer.start();
+
+ var sum: u64 = 0;
+ while (i < key_count) : (i += 1) {
+ const small_key = blocks[i * key_size ..][0..key_size];
+ const final: u64 = blk: {
+ if (H.init_u8s) |init| {
+ if (H.has_crypto_api) {
+ break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
+ } else {
+ break :blk H.ty.hash(init, small_key.*);
+ }
+ }
+ if (H.init_u64) |init| {
+ break :blk H.ty.hash(init, small_key.*);
+ }
+ break :blk H.ty.hash(small_key.*);
+ };
+ sum +%= final;
+ }
+ const elapsed_ns = timer.read();
+
+ const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+ const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);
+
+ std.mem.doNotOptimizeAway(sum);
+
+ return Result{
+ .hash = sum,
+ .throughput = throughput,
+ };
+}
+
+pub fn benchmarkHashSmallApi(comptime H: anytype, key_size: usize, bytes: usize, allocator: std.mem.Allocator) !Result {
+ var blocks = try allocator.alloc(u8, bytes);
+ defer allocator.free(blocks);
+ random.bytes(blocks);
+
+ const key_count = bytes / key_size;
+
+ var timer = try Timer.start();
+
+ var sum: u64 = 0;
+ for (0..key_count) |i| {
+ const small_key = blocks[i * key_size ..][0..key_size];
+ const final: u64 = blk: {
+ if (H.init_u8s) |init| {
+ if (H.has_crypto_api) {
+ break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
+ } else {
+ break :blk H.ty.hashSmall(init, small_key);
+ }
+ }
+ if (H.init_u64) |init| {
+ break :blk H.ty.hashSmall(init, small_key);
+ }
+ break :blk H.ty.hashSmall(small_key);
+ };
+ sum +%= final;
+ }
+ const elapsed_ns = timer.read();
+
+ const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+ const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);
+
+ std.mem.doNotOptimizeAway(sum);
+
+ return Result{
+ .throughput = throughput,
+ .hash = sum,
+ };
+}
+
fn usage() void {
std.debug.print(
\\throughput_test [options]
@@ -205,9 +345,12 @@ pub fn main() !void {
var filter: ?[]u8 = "";
var count: usize = mode(128 * MiB);
- var key_size: usize = 32;
+ var key_size: ?usize = null;
var seed: u32 = 0;
var test_iterative_only = false;
+ var test_arrays = false;
+
+ const default_small_key_size = 32;
var i: usize = 1;
while (i < args.len) : (i += 1) {
@@ -248,12 +391,14 @@ pub fn main() !void {
}
key_size = try std.fmt.parseUnsigned(usize, args[i], 10);
- if (key_size > block_size) {
+ if (key_size.? > block_size) {
try stdout.print("key_size cannot exceed block size of {}\n", .{block_size});
std.os.exit(1);
}
} else if (std.mem.eql(u8, args[i], "--iterative-only")) {
test_iterative_only = true;
+ } else if (std.mem.eql(u8, args[i], "--include-array")) {
+ test_arrays = true;
} else if (std.mem.eql(u8, args[i], "--help")) {
usage();
return;
@@ -268,7 +413,7 @@ pub fn main() !void {
const allocator = gpa.allocator();
inline for (hashes) |H| {
- if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) {
+ if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) hash: {
if (!test_iterative_only or H.has_iterative_api) {
try stdout.print("{s}\n", .{H.name});
@@ -281,9 +426,69 @@ pub fn main() !void {
}
if (!test_iterative_only) {
- prng.seed(seed);
- const result_small = try benchmarkHashSmallKeys(H, key_size, count, allocator);
- try stdout.print(" small keys: {:5} MiB/s [{x:0<16}]\n", .{ result_small.throughput / (1 * MiB), result_small.hash });
+ if (key_size) |size| {
+ prng.seed(seed);
+ const result_small = try benchmarkHashSmallKeys(H, size, count, allocator);
+ try stdout.print(" small keys: {:3}B {:5} MiB/s {} Hashes/s [{x:0<16}]\n", .{
+ size,
+ result_small.throughput / (1 * MiB),
+ result_small.throughput / size,
+ result_small.hash,
+ });
+
+ if (!test_arrays) break :hash;
+ if (H.has_anytype_api) |sizes| {
+ inline for (sizes) |exact_size| {
+ if (size == exact_size) {
+ prng.seed(seed);
+ const result_array = try benchmarkHashSmallKeysArray(H, exact_size, count, allocator);
+ prng.seed(seed);
+ const result_ptr = try benchmarkHashSmallKeysArrayPtr(H, exact_size, count, allocator);
+ try stdout.print(" array: {:5} MiB/s [{x:0<16}]\n", .{
+ result_array.throughput / (1 * MiB),
+ result_array.hash,
+ });
+ try stdout.print(" array ptr: {:5} MiB/s [{x:0<16}]\n", .{
+ result_ptr.throughput / (1 * MiB),
+ result_ptr.hash,
+ });
+ }
+ }
+ }
+ } else {
+ prng.seed(seed);
+ const result_small = try benchmarkHashSmallKeys(H, default_small_key_size, count, allocator);
+ try stdout.print(" small keys: {:3}B {:5} MiB/s {} Hashes/s [{x:0<16}]\n", .{
+ default_small_key_size,
+ result_small.throughput / (1 * MiB),
+ result_small.throughput / default_small_key_size,
+ result_small.hash,
+ });
+
+ if (!test_arrays) break :hash;
+ if (H.has_anytype_api) |sizes| {
+ try stdout.print(" array:\n", .{});
+ inline for (sizes) |exact_size| {
+ prng.seed(seed);
+ const result = try benchmarkHashSmallKeysArray(H, exact_size, count, allocator);
+ try stdout.print(" {d: >3}B {:5} MiB/s [{x:0<16}]\n", .{
+ exact_size,
+ result.throughput / (1 * MiB),
+ result.hash,
+ });
+ }
+ try stdout.print(" array ptr: \n", .{});
+ inline for (sizes) |exact_size| {
+ prng.seed(seed);
+ const result = try benchmarkHashSmallKeysArrayPtr(H, exact_size, count, allocator);
+ try stdout.print(" {d: >3}B {:5} MiB/s [{x:0<16}]\n", .{
+ exact_size,
+ result.throughput / (1 * MiB),
+ result.hash,
+ });
+ }
+ }
+ }
}
}
}