diff options
| author | Frank Denis <124872+jedisct1@users.noreply.github.com> | 2025-11-03 17:09:00 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-11-03 17:09:00 +0100 |
| commit | ee4df4ad3edad160fb737a1935cd86bc2f9cfbbe (patch) | |
| tree | 88a30b506f3d620a74adaf2b01dbb6f2c17d43c7 /lib/std/crypto | |
| parent | afdd04356cbf7821e1f992d1f3aedf7226a7e564 (diff) | |
| download | zig-ee4df4ad3edad160fb737a1935cd86bc2f9cfbbe.tar.gz zig-ee4df4ad3edad160fb737a1935cd86bc2f9cfbbe.zip | |
crypto - threaded K12: separate context computation from thread spawning (#25793)
* threaded K12: separate context computation from thread spawning
Compute all contexts and store them in a pre-allocated array,
then spawn threads using the pre-computed contexts.
This ensures each context is fully materialized in memory with the
correct values before any thread tries to access it.
* kt128: unroll the permutation rounds only twice
This appears to deliver the best performance thanks to improved cache
utilization, and it’s consistent with what we already do for SHA3.
Diffstat (limited to 'lib/std/crypto')
| -rw-r--r-- | lib/std/crypto/kangarootwelve.zig | 189 |
1 files changed, 100 insertions, 89 deletions
diff --git a/lib/std/crypto/kangarootwelve.zig b/lib/std/crypto/kangarootwelve.zig index 7a3ed0f30a..5cad301ed4 100644 --- a/lib/std/crypto/kangarootwelve.zig +++ b/lib/std/crypto/kangarootwelve.zig @@ -230,58 +230,61 @@ fn keccakP1600timesN(comptime N: usize, states: *[5][5]@Vector(N, u64)) void { break :blk offsets; }; - inline for (RC) |rc| { - // θ (theta) - var C: [5]@Vector(N, u64) = undefined; - inline for (0..5) |x| { - C[x] = states[x][0] ^ states[x][1] ^ states[x][2] ^ states[x][3] ^ states[x][4]; - } + var round: usize = 0; + while (round < 12) : (round += 2) { + inline for (0..2) |i| { + // θ (theta) + var C: [5]@Vector(N, u64) = undefined; + inline for (0..5) |x| { + C[x] = states[x][0] ^ states[x][1] ^ states[x][2] ^ states[x][3] ^ states[x][4]; + } - var D: [5]@Vector(N, u64) = undefined; - inline for (0..5) |x| { - D[x] = C[(x + 4) % 5] ^ rol64Vec(N, C[(x + 1) % 5], 1); - } + var D: [5]@Vector(N, u64) = undefined; + inline for (0..5) |x| { + D[x] = C[(x + 4) % 5] ^ rol64Vec(N, C[(x + 1) % 5], 1); + } - // Apply D to all lanes - inline for (0..5) |x| { - states[x][0] ^= D[x]; - states[x][1] ^= D[x]; - states[x][2] ^= D[x]; - states[x][3] ^= D[x]; - states[x][4] ^= D[x]; - } + // Apply D to all lanes + inline for (0..5) |x| { + states[x][0] ^= D[x]; + states[x][1] ^= D[x]; + states[x][2] ^= D[x]; + states[x][3] ^= D[x]; + states[x][4] ^= D[x]; + } - // ρ (rho) and π (pi) - optimized with pre-computed offsets - var current = states[1][0]; - var px: usize = 1; - var py: usize = 0; - inline for (rho_offsets) |rot| { - const next_y = (2 * px + 3 * py) % 5; - const next = states[py][next_y]; - states[py][next_y] = rol64Vec(N, current, rot); - current = next; - px = py; - py = next_y; - } + // ρ (rho) and π (pi) - optimized with pre-computed offsets + var current = states[1][0]; + var px: usize = 1; + var py: usize = 0; + inline for (rho_offsets) |rot| { + const next_y = (2 * px + 3 * py) % 5; + const next = states[py][next_y]; + states[py][next_y] = rol64Vec(N, current, rot); + current = next; + px = py; + py = next_y; + } - // χ (chi) - optimized with better register usage - inline for (0..5) |y| { - const t0 = states[0][y]; - const t1 = states[1][y]; - const t2 = states[2][y]; - const t3 = states[3][y]; - const t4 = states[4][y]; - - states[0][y] = t0 ^ (~t1 & t2); - states[1][y] = t1 ^ (~t2 & t3); - states[2][y] = t2 ^ (~t3 & t4); - states[3][y] = t3 ^ (~t4 & t0); - states[4][y] = t4 ^ (~t0 & t1); - } + // χ (chi) - optimized with better register usage + inline for (0..5) |y| { + const t0 = states[0][y]; + const t1 = states[1][y]; + const t2 = states[2][y]; + const t3 = states[3][y]; + const t4 = states[4][y]; + + states[0][y] = t0 ^ (~t1 & t2); + states[1][y] = t1 ^ (~t2 & t3); + states[2][y] = t2 ^ (~t3 & t4); + states[3][y] = t3 ^ (~t4 & t0); + states[4][y] = t4 ^ (~t0 & t1); + } - // ι (iota) - const rc_splat: @Vector(N, u64) = @splat(rc); - states[0][0] ^= rc_splat; + // ι (iota) + const rc_splat: @Vector(N, u64) = @splat(RC[round + i]); + states[0][0] ^= rc_splat; + } } } @@ -323,46 +326,49 @@ fn keccakP(state: *[200]u8) void { } // Apply 12 rounds - inline for (RC) |rc| { - // θ - var C: [5]u64 = undefined; - inline for (0..5) |x| { - C[x] = lanes[x][0] ^ lanes[x][1] ^ lanes[x][2] ^ lanes[x][3] ^ lanes[x][4]; - } - var D: [5]u64 = undefined; - inline for (0..5) |x| { - D[x] = C[(x + 4) % 5] ^ std.math.rotl(u64, C[(x + 1) % 5], 1); - } - inline for (0..5) |x| { - inline for (0..5) |y| { - lanes[x][y] ^= D[x]; + var round: usize = 0; + while (round < 12) : (round += 2) { + inline for (0..2) |i| { + // θ + var C: [5]u64 = undefined; + inline for (0..5) |x| { + C[x] = lanes[x][0] ^ lanes[x][1] ^ lanes[x][2] ^ lanes[x][3] ^ lanes[x][4]; + } + var D: [5]u64 = undefined; + inline for (0..5) |x| { + D[x] = C[(x + 4) % 5] ^ std.math.rotl(u64, C[(x + 1) % 5], 1); + } + inline for (0..5) |x| { + inline for (0..5) |y| { + lanes[x][y] ^= D[x]; + } } - } - // ρ and π - var current = lanes[1][0]; - var px: usize = 1; - var py: usize = 0; - inline for (0..24) |t| { - const temp = lanes[py][(2 * px + 3 * py) % 5]; - const rot_amount = ((t + 1) * (t + 2) / 2) % 64; - lanes[py][(2 * px + 3 * py) % 5] = std.math.rotl(u64, current, @as(u6, @intCast(rot_amount))); - current = temp; - const temp_x = py; - py = (2 * px + 3 * py) % 5; - px = temp_x; - } + // ρ and π + var current = lanes[1][0]; + var px: usize = 1; + var py: usize = 0; + inline for (0..24) |t| { + const temp = lanes[py][(2 * px + 3 * py) % 5]; + const rot_amount = ((t + 1) * (t + 2) / 2) % 64; + lanes[py][(2 * px + 3 * py) % 5] = std.math.rotl(u64, current, @as(u6, @intCast(rot_amount))); + current = temp; + const temp_x = py; + py = (2 * px + 3 * py) % 5; + px = temp_x; + } - // χ - inline for (0..5) |y| { - const T = [5]u64{ lanes[0][y], lanes[1][y], lanes[2][y], lanes[3][y], lanes[4][y] }; - inline for (0..5) |x| { - lanes[x][y] = T[x] ^ (~T[(x + 1) % 5] & T[(x + 2) % 5]); + // χ + inline for (0..5) |y| { + const T = [5]u64{ lanes[0][y], lanes[1][y], lanes[2][y], lanes[3][y], lanes[4][y] }; + inline for (0..5) |x| { + lanes[x][y] = T[x] ^ (~T[(x + 1) % 5] & T[(x + 2) % 5]); + } } - } - // ι - lanes[0][0] ^= rc; + // ι + lanes[0][0] ^= RC[round + i]; + } } // Store lanes back to state @@ -759,32 +765,37 @@ fn ktMultiThreaded( const all_scratch = try allocator.alloc(u8, thread_count * scratch_size); defer allocator.free(all_scratch); - var group: Io.Group = .init; + const contexts = try allocator.alloc(LeafBatchContext, thread_count); + defer allocator.free(contexts); + var leaves_assigned: usize = 0; - var thread_idx: usize = 0; + var context_count: usize = 0; while (leaves_assigned < total_leaves) { const batch_count = @min(leaves_per_thread, total_leaves - leaves_assigned); const batch_start = chunk_size + leaves_assigned * chunk_size; const cvs_offset = leaves_assigned * cv_size; - const ctx = LeafBatchContext{ + contexts[context_count] = LeafBatchContext{ .output_cvs = cvs[cvs_offset .. cvs_offset + batch_count * cv_size], .batch_start = batch_start, .batch_count = batch_count, .view = view, - .scratch_buffer = all_scratch[thread_idx * scratch_size .. (thread_idx + 1) * scratch_size], + .scratch_buffer = all_scratch[context_count * scratch_size .. (context_count + 1) * scratch_size], .total_len = total_len, }; + leaves_assigned += batch_count; + context_count += 1; + } + + var group: Io.Group = .init; + for (contexts[0..context_count]) |ctx| { group.async(io, struct { fn process(c: LeafBatchContext) void { processLeafBatch(Variant, c); } }.process, .{ctx}); - - leaves_assigned += batch_count; - thread_idx += 1; } // Wait for all threads to complete |
