From 10edb6d352173dfbc9962ce3db064384319e77f8 Mon Sep 17 00:00:00 2001
From: Cody Tapscott
Date: Sat, 22 Oct 2022 23:50:38 -0700
Subject: crypto.sha2: Use intrinsics for SHA-256 on x86-64 and AArch64

There's probably plenty of room to optimize these further in the
future, but for the moment this gives a ~3x improvement on Intel x86-64
processors, ~5x on AMD, and ~10x on M1 Macs.

These extensions are very new; most processors prior to 2020 do not
support them. AVX-512 is a slightly older alternative that we could use
on Intel for a much bigger performance bump, but it has been fused off
on Intel's latest hybrid architectures, and it relies on computing
independent SHA hashes in parallel. In contrast, these SHA intrinsics
provide the usual single-threaded, single-stream interface and should
continue working on new processors.

AArch64 also has SHA-512 intrinsics that we could take advantage of in
the future.
---
 lib/std/crypto/sha2.zig | 234 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 164 insertions(+), 70 deletions(-)

diff --git a/lib/std/crypto/sha2.zig b/lib/std/crypto/sha2.zig
index b7a78c4b44..d51c2e8cc7 100644
--- a/lib/std/crypto/sha2.zig
+++ b/lib/std/crypto/sha2.zig
@@ -1,4 +1,5 @@
 const std = @import("../std.zig");
+const builtin = @import("builtin");
 const mem = std.mem;
 const math = std.math;
 const htest = @import("test.zig");
@@ -16,10 +17,9 @@ const RoundParam256 = struct {
     g: usize,
     h: usize,
     i: usize,
-    k: u32,
 };
 
-fn roundParam256(a: usize, b: usize, c: usize, d: usize, e: usize, f: usize, g: usize, h: usize, i: usize, k: u32) RoundParam256 {
+fn roundParam256(a: usize, b: usize, c: usize, d: usize, e: usize, f: usize, g: usize, h: usize, i: usize) RoundParam256 {
     return RoundParam256{
         .a = a,
         .b = b,
@@ -30,7 +30,6 @@ fn roundParam256(a: usize, b: usize, c: usize, d: usize, e: usize, f: usize, g:
         .g = g,
         .h = h,
         .i = i,
-        .k = k,
     };
 }
 
@@ -70,6 +69,8 @@ const Sha256Params = Sha2Params32{
     .digest_bits = 256,
 };
 
+const v4u32 = @Vector(4, u32);
+
 /// SHA-224
 pub const Sha224 = Sha2x32(Sha224Params);
 
@@ -83,7 +84,7 @@ fn Sha2x32(comptime params: Sha2Params32) type {
         pub const digest_length = params.digest_bits / 8;
         pub const Options = struct {};
 
-        s: [8]u32,
+        s: [8]u32 align(16),
         // Streaming Cache
         buf: [64]u8 = undefined,
         buf_len: u8 = 0,
@@ -168,8 +169,19 @@ fn Sha2x32(comptime params: Sha2Params32) type {
             }
         }
 
+        const W = [64]u32{
+            0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+            0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+            0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+            0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+            0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+            0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+            0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+            0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
+        };
+
         fn round(d: *Self, b: *const [64]u8) void {
-            var s: [64]u32 = undefined;
+            var s: [64]u32 align(16) = undefined;
 
             var i: usize = 0;
             while (i < 16) : (i += 1) {
@@ -179,6 +191,88 @@ fn Sha2x32(comptime params: Sha2Params32) type {
                 s[i] |= @as(u32, b[i * 4 + 2]) << 8;
                 s[i] |= @as(u32, b[i * 4 + 3]) << 0;
             }
+
+            if (builtin.cpu.arch == .aarch64 and builtin.cpu.features.isEnabled(@enumToInt(std.Target.aarch64.Feature.sha2))) {
+                var x: v4u32 = d.s[0..4].*;
+                var y: v4u32 = d.s[4..8].*;
+                const s_v = @ptrCast(*[16]v4u32, &s);
+
+                comptime var k: u8 = 0;
+                inline while (k < 16) : (k += 1) {
+                    if (k > 3) {
+                        s_v[k] = asm (
+                            \\sha256su0.4s %[w0_3], %[w4_7]
+                            \\sha256su1.4s %[w0_3], %[w8_11], %[w12_15]
+                            : [w0_3] "=w" (-> v4u32),
+                            : [_] "0" (s_v[k - 4]),
+                              [w4_7] "w" (s_v[k - 3]),
+                              [w8_11] "w" (s_v[k - 2]),
+                              [w12_15] "w" (s_v[k - 1]),
+                        );
+                    }
+
+                    const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
+                    asm volatile (
+                        \\mov.4s v0, %[x]
+                        \\sha256h.4s %[x], %[y], %[w]
+                        \\sha256h2.4s %[y], v0, %[w]
+                        : [x] "=w" (x),
+                          [y] "=w" (y),
+                        : [_] "0" (x),
+                          [_] "1" (y),
+                          [w] "w" (w),
+                        : "v0"
+                    );
+                }
+
+                d.s[0..4].* = x +% @as(v4u32, d.s[0..4].*);
+                d.s[4..8].* = y +% @as(v4u32, d.s[4..8].*);
+                return;
+            } else if (builtin.cpu.arch == .x86_64 and builtin.cpu.features.isEnabled(@enumToInt(std.Target.x86.Feature.sha))) {
+                var x: v4u32 = [_]u32{ d.s[5], d.s[4], d.s[1], d.s[0] };
+                var y: v4u32 = [_]u32{ d.s[7], d.s[6], d.s[3], d.s[2] };
+                const s_v = @ptrCast(*[16]v4u32, &s);
+
+                comptime var k: u8 = 0;
+                inline while (k < 16) : (k += 1) {
+                    if (k < 12) {
+                        const r = asm ("sha256msg1 %[w4_7], %[w0_3]"
+                            : [w0_3] "=x" (-> v4u32),
+                            : [_] "0" (s_v[k]),
+                              [w4_7] "x" (s_v[k + 1]),
+                        );
+                        const t = @shuffle(u32, s_v[k + 2], s_v[k + 3], [_]i32{ 1, 2, 3, -1 });
+                        s_v[k + 4] = asm ("sha256msg2 %[w12_15], %[t]"
+                            : [t] "=x" (-> v4u32),
+                            : [_] "0" (r +% t),
+                              [w12_15] "x" (s_v[k + 3]),
+                        );
+                    }
+
+                    const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
+                    asm volatile (
+                        \\sha256rnds2 %[x], %[y]
+                        \\pshufd $0xe, %%xmm0, %%xmm0
+                        \\sha256rnds2 %[y], %[x]
+                        : [y] "=x" (y),
+                          [x] "=x" (x),
+                        : [_] "0" (y),
+                          [_] "1" (x),
+                          [_] "{xmm0}" (w),
+                    );
+                }
+
+                d.s[0] +%= x[3];
+                d.s[1] +%= x[2];
+                d.s[4] +%= x[1];
+                d.s[5] +%= x[0];
+                d.s[2] +%= y[3];
+                d.s[3] +%= y[2];
+                d.s[6] +%= y[1];
+                d.s[7] +%= y[0];
+                return;
+            }
+
             while (i < 64) : (i += 1) {
                 s[i] = s[i - 16] +% s[i - 7] +% (math.rotr(u32, s[i - 15], @as(u32, 7)) ^ math.rotr(u32, s[i - 15], @as(u32, 18)) ^ (s[i - 15] >> 3)) +% (math.rotr(u32, s[i - 2], @as(u32, 17)) ^ math.rotr(u32, s[i - 2], @as(u32, 19)) ^ (s[i - 2] >> 10));
             }
@@ -195,73 +289,73 @@ fn Sha2x32(comptime params: Sha2Params32) type {
             };
 
             const round0 = comptime [_]RoundParam256{
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 0, 0x428A2F98),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 1, 0x71374491),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 2, 0xB5C0FBCF),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 3, 0xE9B5DBA5),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 4, 0x3956C25B),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 5, 0x59F111F1),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 6, 0x923F82A4),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 7, 0xAB1C5ED5),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 8, 0xD807AA98),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 9, 0x12835B01),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 10, 0x243185BE),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 11, 0x550C7DC3),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 12, 0x72BE5D74),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 13, 0x80DEB1FE),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 14, 0x9BDC06A7),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 15, 0xC19BF174),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 16, 0xE49B69C1),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 17, 0xEFBE4786),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 18, 0x0FC19DC6),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 19, 0x240CA1CC),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 20, 0x2DE92C6F),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 21, 0x4A7484AA),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 22, 0x5CB0A9DC),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 23, 0x76F988DA),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 24, 0x983E5152),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 25, 0xA831C66D),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 26, 0xB00327C8),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 27, 0xBF597FC7),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 28, 0xC6E00BF3),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 29, 0xD5A79147),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 30, 0x06CA6351),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 31, 0x14292967),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 32, 0x27B70A85),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 33, 0x2E1B2138),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 34, 0x4D2C6DFC),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 35, 0x53380D13),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 36, 0x650A7354),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 37, 0x766A0ABB),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 38, 0x81C2C92E),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 39, 0x92722C85),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 40, 0xA2BFE8A1),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 41, 0xA81A664B),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 42, 0xC24B8B70),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 43, 0xC76C51A3),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 44, 0xD192E819),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 45, 0xD6990624),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 46, 0xF40E3585),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 47, 0x106AA070),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 48, 0x19A4C116),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 49, 0x1E376C08),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 50, 0x2748774C),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 51, 0x34B0BCB5),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 52, 0x391C0CB3),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 53, 0x4ED8AA4A),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 54, 0x5B9CCA4F),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 55, 0x682E6FF3),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 56, 0x748F82EE),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 57, 0x78A5636F),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 58, 0x84C87814),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 59, 0x8CC70208),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 60, 0x90BEFFFA),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 61, 0xA4506CEB),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 62, 0xBEF9A3F7),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 63, 0xC67178F2),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 0),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 1),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 2),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 3),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 4),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 5),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 6),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 7),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 8),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 9),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 10),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 11),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 12),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 13),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 14),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 15),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 16),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 17),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 18),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 19),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 20),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 21),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 22),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 23),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 24),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 25),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 26),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 27),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 28),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 29),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 30),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 31),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 32),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 33),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 34),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 35),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 36),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 37),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 38),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 39),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 40),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 41),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 42),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 43),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 44),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 45),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 46),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 47),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 48),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 49),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 50),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 51),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 52),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 53),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 54),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 55),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 56),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 57),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 58),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 59),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 60),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 61),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 62),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 63),
             };
 
             inline for (round0) |r| {
-                v[r.h] = v[r.h] +% (math.rotr(u32, v[r.e], @as(u32, 6)) ^ math.rotr(u32, v[r.e], @as(u32, 11)) ^ math.rotr(u32, v[r.e], @as(u32, 25))) +% (v[r.g] ^ (v[r.e] & (v[r.f] ^ v[r.g]))) +% r.k +% s[r.i];
+                v[r.h] = v[r.h] +% (math.rotr(u32, v[r.e], @as(u32, 6)) ^ math.rotr(u32, v[r.e], @as(u32, 11)) ^ math.rotr(u32, v[r.e], @as(u32, 25))) +% (v[r.g] ^ (v[r.e] & (v[r.f] ^ v[r.g]))) +% W[r.i] +% s[r.i];
                 v[r.d] = v[r.d] +% v[r.h];
-- 
cgit v1.2.3


From ee241c47ee675050e4e4b0eabd6ba06a82cc626e Mon Sep 17 00:00:00 2001
From: Cody Tapscott
Date: Mon, 24 Oct 2022 00:38:10 -0700
Subject: std.crypto: SHA-256 Properly gate comptime conditional

This feature detection must be done at comptime so that we avoid
generating invalid ASM for the target.
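
As a rough illustration (mirroring the gating used in round() below,
not a new API), the point is that `builtin.cpu.arch` is comptime-known,
so only the matching switch arm is semantically analyzed and the inline
assembly for the other architecture never reaches the backend:

    switch (builtin.cpu.arch) {
        // Only the arm for the comptime-known target arch is analyzed.
        .aarch64 => if (comptime builtin.cpu.features.isEnabled(@enumToInt(std.Target.aarch64.Feature.sha2))) {
            // NEON path: sha256su0/sha256su1 + sha256h/sha256h2
        },
        .x86_64 => if (comptime builtin.cpu.features.isEnabled(@enumToInt(std.Target.x86.Feature.sha))) {
            // SHA-NI path: sha256msg1/sha256msg2 + sha256rnds2
        },
        else => {},
    }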
---
 lib/std/crypto/sha2.zig | 150 +++++++++++++++++++++++++----------------------
 1 file changed, 77 insertions(+), 73 deletions(-)

diff --git a/lib/std/crypto/sha2.zig b/lib/std/crypto/sha2.zig
index d51c2e8cc7..5b2c8a89e1 100644
--- a/lib/std/crypto/sha2.zig
+++ b/lib/std/crypto/sha2.zig
@@ -192,85 +192,89 @@ fn Sha2x32(comptime params: Sha2Params32) type {
                 s[i] |= @as(u32, b[i * 4 + 3]) << 0;
             }
 
-            if (builtin.cpu.arch == .aarch64 and builtin.cpu.features.isEnabled(@enumToInt(std.Target.aarch64.Feature.sha2))) {
-                var x: v4u32 = d.s[0..4].*;
-                var y: v4u32 = d.s[4..8].*;
-                const s_v = @ptrCast(*[16]v4u32, &s);
-
-                comptime var k: u8 = 0;
-                inline while (k < 16) : (k += 1) {
-                    if (k > 3) {
-                        s_v[k] = asm (
-                            \\sha256su0.4s %[w0_3], %[w4_7]
-                            \\sha256su1.4s %[w0_3], %[w8_11], %[w12_15]
-                            : [w0_3] "=w" (-> v4u32),
-                            : [_] "0" (s_v[k - 4]),
-                              [w4_7] "w" (s_v[k - 3]),
-                              [w8_11] "w" (s_v[k - 2]),
-                              [w12_15] "w" (s_v[k - 1]),
+            switch (builtin.cpu.arch) {
+                .aarch64 => if (comptime builtin.cpu.features.isEnabled(@enumToInt(std.Target.aarch64.Feature.sha2))) {
+                    var x: v4u32 = d.s[0..4].*;
+                    var y: v4u32 = d.s[4..8].*;
+                    const s_v = @ptrCast(*[16]v4u32, &s);
+
+                    comptime var k: u8 = 0;
+                    inline while (k < 16) : (k += 1) {
+                        if (k > 3) {
+                            s_v[k] = asm (
+                                \\sha256su0.4s %[w0_3], %[w4_7]
+                                \\sha256su1.4s %[w0_3], %[w8_11], %[w12_15]
+                                : [w0_3] "=w" (-> v4u32),
+                                : [_] "0" (s_v[k - 4]),
+                                  [w4_7] "w" (s_v[k - 3]),
+                                  [w8_11] "w" (s_v[k - 2]),
+                                  [w12_15] "w" (s_v[k - 1]),
+                            );
+                        }
+
+                        const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
+                        asm volatile (
+                            \\mov.4s v0, %[x]
+                            \\sha256h.4s %[x], %[y], %[w]
+                            \\sha256h2.4s %[y], v0, %[w]
+                            : [x] "=w" (x),
+                              [y] "=w" (y),
+                            : [_] "0" (x),
+                              [_] "1" (y),
+                              [w] "w" (w),
+                            : "v0"
                         );
                     }
 
-                    const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
-                    asm volatile (
-                        \\mov.4s v0, %[x]
-                        \\sha256h.4s %[x], %[y], %[w]
-                        \\sha256h2.4s %[y], v0, %[w]
-                        : [x] "=w" (x),
-                          [y] "=w" (y),
-                        : [_] "0" (x),
-                          [_] "1" (y),
-                          [w] "w" (w),
-                        : "v0"
-                    );
-                }
-
-                d.s[0..4].* = x +% @as(v4u32, d.s[0..4].*);
-                d.s[4..8].* = y +% @as(v4u32, d.s[4..8].*);
-                return;
-            } else if (builtin.cpu.arch == .x86_64 and builtin.cpu.features.isEnabled(@enumToInt(std.Target.x86.Feature.sha))) {
-                var x: v4u32 = [_]u32{ d.s[5], d.s[4], d.s[1], d.s[0] };
-                var y: v4u32 = [_]u32{ d.s[7], d.s[6], d.s[3], d.s[2] };
-                const s_v = @ptrCast(*[16]v4u32, &s);
-
-                comptime var k: u8 = 0;
-                inline while (k < 16) : (k += 1) {
-                    if (k < 12) {
-                        const r = asm ("sha256msg1 %[w4_7], %[w0_3]"
-                            : [w0_3] "=x" (-> v4u32),
-                            : [_] "0" (s_v[k]),
-                              [w4_7] "x" (s_v[k + 1]),
-                        );
-                        const t = @shuffle(u32, s_v[k + 2], s_v[k + 3], [_]i32{ 1, 2, 3, -1 });
-                        s_v[k + 4] = asm ("sha256msg2 %[w12_15], %[t]"
-                            : [t] "=x" (-> v4u32),
-                            : [_] "0" (r +% t),
-                              [w12_15] "x" (s_v[k + 3]),
+                    d.s[0..4].* = x +% @as(v4u32, d.s[0..4].*);
+                    d.s[4..8].* = y +% @as(v4u32, d.s[4..8].*);
+                    return;
+                },
+                .x86_64 => if (comptime builtin.cpu.features.isEnabled(@enumToInt(std.Target.x86.Feature.sha))) {
+                    var x: v4u32 = [_]u32{ d.s[5], d.s[4], d.s[1], d.s[0] };
+                    var y: v4u32 = [_]u32{ d.s[7], d.s[6], d.s[3], d.s[2] };
+                    const s_v = @ptrCast(*[16]v4u32, &s);
+
+                    comptime var k: u8 = 0;
+                    inline while (k < 16) : (k += 1) {
+                        if (k < 12) {
+                            const r = asm ("sha256msg1 %[w4_7], %[w0_3]"
+                                : [w0_3] "=x" (-> v4u32),
+                                : [_] "0" (s_v[k]),
+                                  [w4_7] "x" (s_v[k + 1]),
+                            );
+                            const t = @shuffle(u32, s_v[k + 2], s_v[k + 3], [_]i32{ 1, 2, 3, -1 });
+                            s_v[k + 4] = asm ("sha256msg2 %[w12_15], %[t]"
+                                : [t] "=x" (-> v4u32),
+                                : [_] "0" (r +% t),
+                                  [w12_15] "x" (s_v[k + 3]),
+                            );
+                        }
+
+                        const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
+                        asm volatile (
+                            \\sha256rnds2 %[x], %[y]
+                            \\pshufd $0xe, %%xmm0, %%xmm0
+                            \\sha256rnds2 %[y], %[x]
+                            : [y] "=x" (y),
+                              [x] "=x" (x),
+                            : [_] "0" (y),
+                              [_] "1" (x),
+                              [_] "{xmm0}" (w),
                         );
                     }
 
-                    const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
-                    asm volatile (
-                        \\sha256rnds2 %[x], %[y]
-                        \\pshufd $0xe, %%xmm0, %%xmm0
-                        \\sha256rnds2 %[y], %[x]
-                        : [y] "=x" (y),
-                          [x] "=x" (x),
-                        : [_] "0" (y),
-                          [_] "1" (x),
-                          [_] "{xmm0}" (w),
-                    );
-                }
-
-                d.s[0] +%= x[3];
-                d.s[1] +%= x[2];
-                d.s[4] +%= x[1];
-                d.s[5] +%= x[0];
-                d.s[2] +%= y[3];
-                d.s[3] +%= y[2];
-                d.s[6] +%= y[1];
-                d.s[7] +%= y[0];
-                return;
+                    d.s[0] +%= x[3];
+                    d.s[1] +%= x[2];
+                    d.s[4] +%= x[1];
+                    d.s[5] +%= x[0];
+                    d.s[2] +%= y[3];
+                    d.s[3] +%= y[2];
+                    d.s[6] +%= y[1];
+                    d.s[7] +%= y[0];
+                    return;
+                },
+                else => {},
             }
 
             while (i < 64) : (i += 1) {
-- 
cgit v1.2.3


From 4c1f71e866088a1a2e943331256115ed7e3daf98 Mon Sep 17 00:00:00 2001
From: Cody Tapscott
Date: Mon, 24 Oct 2022 09:47:31 -0700
Subject: std.crypto: Optimize SHA-256 intrinsics for AMD x86-64

This gets us most of the way back to the performance I had when I was
using the LLVM intrinsics:
 - Intel(R) Core(TM) i7-1068NG7 CPU @ 2.30GHz:
     190.67 MB/s (w/o intrinsics) -> 1285.08 MB/s
 - AMD EPYC 7763 (VM) @ 2.45 GHz:
     240.09 MB/s (w/o intrinsics) -> 1360.78 MB/s
 - Apple M1:
     216.96 MB/s (w/o intrinsics) -> 2133.69 MB/s

Minor changes to this source can swing performance from 400 MB/s to
1400 MB/s or... 20 MB/s, depending on how it interacts with the
optimizer. I have a sneaking suspicion that despite LLVM inheriting
GCC's extremely strict inline assembly semantics, its passes are rather
skittish around inline assembly (and almost certainly, its instruction
cost models can assume nothing).
---
 lib/std/crypto/sha2.zig | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/lib/std/crypto/sha2.zig b/lib/std/crypto/sha2.zig
index 5b2c8a89e1..5380f32f0b 100644
--- a/lib/std/crypto/sha2.zig
+++ b/lib/std/crypto/sha2.zig
@@ -182,14 +182,8 @@ fn Sha2x32(comptime params: Sha2Params32) type {
 
         fn round(d: *Self, b: *const [64]u8) void {
             var s: [64]u32 align(16) = undefined;
-
-            var i: usize = 0;
-            while (i < 16) : (i += 1) {
-                s[i] = 0;
-                s[i] |= @as(u32, b[i * 4 + 0]) << 24;
-                s[i] |= @as(u32, b[i * 4 + 1]) << 16;
-                s[i] |= @as(u32, b[i * 4 + 2]) << 8;
-                s[i] |= @as(u32, b[i * 4 + 3]) << 0;
+            for (@ptrCast(*align(1) const [16]u32, b)) |*elem, i| {
+                s[i] = mem.readIntBig(u32, mem.asBytes(elem));
             }
 
             switch (builtin.cpu.arch) {
@@ -238,30 +232,35 @@ fn Sha2x32(comptime params: Sha2Params32) type {
                     comptime var k: u8 = 0;
                     inline while (k < 16) : (k += 1) {
                         if (k < 12) {
-                            const r = asm ("sha256msg1 %[w4_7], %[w0_3]"
-                                : [w0_3] "=x" (-> v4u32),
-                                : [_] "0" (s_v[k]),
+                            var tmp = s_v[k];
+                            s_v[k + 4] = asm (
+                                \\ sha256msg1 %[w4_7], %[tmp]
+                                \\ vpalignr $0x4, %[w8_11], %[w12_15], %[result]
+                                \\ paddd %[tmp], %[result]
+                                \\ sha256msg2 %[w12_15], %[result]
+                                : [tmp] "=&x" (tmp),
+                                  [result] "=&x" (-> v4u32),
+                                : [_] "0" (tmp),
                                   [w4_7] "x" (s_v[k + 1]),
-                            );
-                            const t = @shuffle(u32, s_v[k + 2], s_v[k + 3], [_]i32{ 1, 2, 3, -1 });
-                            s_v[k + 4] = asm ("sha256msg2 %[w12_15], %[t]"
-                                : [t] "=x" (-> v4u32),
-                                : [_] "0" (r +% t),
+                                  [w8_11] "x" (s_v[k + 2]),
                                   [w12_15] "x" (s_v[k + 3]),
                             );
                         }
 
                         const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
-                        asm volatile (
-                            \\sha256rnds2 %[x], %[y]
-                            \\pshufd $0xe, %%xmm0, %%xmm0
-                            \\sha256rnds2 %[y], %[x]
-                            : [y] "=x" (y),
-                              [x] "=x" (x),
+                        y = asm ("sha256rnds2 %[x], %[y]"
+                            : [y] "=x" (-> v4u32),
                             : [_] "0" (y),
-                              [_] "1" (x),
+                              [x] "x" (x),
                               [_] "{xmm0}" (w),
                         );
+
+                        x = asm ("sha256rnds2 %[y], %[x]"
+                            : [x] "=x" (-> v4u32),
+                            : [_] "0" (x),
+                              [y] "x" (y),
+                              [_] "{xmm0}" (@bitCast(v4u32, @bitCast(u128, w) >> 64)),
+                        );
                     }
 
                     d.s[0] +%= x[3];
@@ -277,6 +276,7 @@ fn Sha2x32(comptime params: Sha2Params32) type {
                 else => {},
             }
 
+            var i: usize = 16;
             while (i < 64) : (i += 1) {
                 s[i] = s[i - 16] +% s[i - 7] +% (math.rotr(u32, s[i - 15], @as(u32, 7)) ^ math.rotr(u32, s[i - 15], @as(u32, 18)) ^ (s[i - 15] >> 3)) +% (math.rotr(u32, s[i - 2], @as(u32, 17)) ^ math.rotr(u32, s[i - 2], @as(u32, 19)) ^ (s[i - 2] >> 10));
             }
-- 
cgit v1.2.3


From f9fe548e41a41e3edcff4d30f495246d0fee145b Mon Sep 17 00:00:00 2001
From: Cody Tapscott
Date: Fri, 28 Oct 2022 08:24:12 -0700
Subject: std.crypto: Add `isComptime` guard around intrinsics

Comptime code can't execute assembly code, so we need some way to force
comptime code to use the generic path. This should be replaced with
whatever is implemented for #868, when that day comes.

I am seeing that the result for the hash is incorrect in stage1 and
crashes stage2, so presumably this never worked correctly. I will
follow up on that soon.
---
 lib/std/crypto/sha2.zig | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/lib/std/crypto/sha2.zig b/lib/std/crypto/sha2.zig
index 5380f32f0b..510c22b14f 100644
--- a/lib/std/crypto/sha2.zig
+++ b/lib/std/crypto/sha2.zig
@@ -71,6 +71,12 @@ const Sha256Params = Sha2Params32{
 
 const v4u32 = @Vector(4, u32);
 
+// TODO: Remove once https://github.com/ziglang/zig/issues/868 is resolved.
+fn isComptime() bool {
+    var a: u8 = 0;
+    return @typeInfo(@TypeOf(.{a})).Struct.fields[0].is_comptime;
+}
+
 /// SHA-224
 pub const Sha224 = Sha2x32(Sha224Params);
 
@@ -187,7 +193,7 @@ fn Sha2x32(comptime params: Sha2Params32) type {
             }
 
             switch (builtin.cpu.arch) {
-                .aarch64 => if (comptime builtin.cpu.features.isEnabled(@enumToInt(std.Target.aarch64.Feature.sha2))) {
+                .aarch64 => if (!isComptime() and comptime builtin.cpu.features.isEnabled(@enumToInt(std.Target.aarch64.Feature.sha2))) {
                     var x: v4u32 = d.s[0..4].*;
                     var y: v4u32 = d.s[4..8].*;
                     const s_v = @ptrCast(*[16]v4u32, &s);
@@ -224,7 +230,7 @@ fn Sha2x32(comptime params: Sha2Params32) type {
                     d.s[4..8].* = y +% @as(v4u32, d.s[4..8].*);
                     return;
                 },
-                .x86_64 => if (comptime builtin.cpu.features.isEnabled(@enumToInt(std.Target.x86.Feature.sha))) {
+                .x86_64 => if (!isComptime() and comptime builtin.cpu.features.isEnabled(@enumToInt(std.Target.x86.Feature.sha))) {
                     var x: v4u32 = [_]u32{ d.s[5], d.s[4], d.s[1], d.s[0] };
                     var y: v4u32 = [_]u32{ d.s[7], d.s[6], d.s[3], d.s[2] };
                     const s_v = @ptrCast(*[16]v4u32, &s);
-- 
cgit v1.2.3


From 67fa3262b1329316cbf62e00ba3890d68a9f5f6d Mon Sep 17 00:00:00 2001
From: Cody Tapscott
Date: Fri, 28 Oct 2022 17:17:08 -0700
Subject: std.crypto: Use `featureSetHas` to gate intrinsics

This also fixes a bug where the feature gating was not taking effect at
comptime due to https://github.com/ziglang/zig/issues/6768
---
 lib/std/crypto/sha2.zig | 166 ++++++++++++++++++++++++------------------------
 1 file changed, 84 insertions(+), 82 deletions(-)

diff --git a/lib/std/crypto/sha2.zig b/lib/std/crypto/sha2.zig
index 510c22b14f..9cdf8edcf1 100644
--- a/lib/std/crypto/sha2.zig
+++ b/lib/std/crypto/sha2.zig
@@ -192,94 +192,96 @@ fn Sha2x32(comptime params: Sha2Params32) type {
                 s[i] = mem.readIntBig(u32, mem.asBytes(elem));
             }
 
-            switch (builtin.cpu.arch) {
-                .aarch64 => if (!isComptime() and comptime builtin.cpu.features.isEnabled(@enumToInt(std.Target.aarch64.Feature.sha2))) {
-                    var x: v4u32 = d.s[0..4].*;
-                    var y: v4u32 = d.s[4..8].*;
-                    const s_v = @ptrCast(*[16]v4u32, &s);
-
-                    comptime var k: u8 = 0;
-                    inline while (k < 16) : (k += 1) {
-                        if (k > 3) {
-                            s_v[k] = asm (
-                                \\sha256su0.4s %[w0_3], %[w4_7]
-                                \\sha256su1.4s %[w0_3], %[w8_11], %[w12_15]
-                                : [w0_3] "=w" (-> v4u32),
-                                : [_] "0" (s_v[k - 4]),
-                                  [w4_7] "w" (s_v[k - 3]),
-                                  [w8_11] "w" (s_v[k - 2]),
-                                  [w12_15] "w" (s_v[k - 1]),
+            if (!isComptime()) {
+                switch (builtin.cpu.arch) {
+                    .aarch64 => if (comptime std.Target.aarch64.featureSetHas(builtin.cpu.features, .sha2)) {
+                        var x: v4u32 = d.s[0..4].*;
+                        var y: v4u32 = d.s[4..8].*;
+                        const s_v = @ptrCast(*[16]v4u32, &s);
+
+                        comptime var k: u8 = 0;
+                        inline while (k < 16) : (k += 1) {
+                            if (k > 3) {
+                                s_v[k] = asm (
+                                    \\sha256su0.4s %[w0_3], %[w4_7]
+                                    \\sha256su1.4s %[w0_3], %[w8_11], %[w12_15]
+                                    : [w0_3] "=w" (-> v4u32),
+                                    : [_] "0" (s_v[k - 4]),
+                                      [w4_7] "w" (s_v[k - 3]),
+                                      [w8_11] "w" (s_v[k - 2]),
+                                      [w12_15] "w" (s_v[k - 1]),
+                                );
+                            }
+
+                            const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
+                            asm volatile (
+                                \\mov.4s v0, %[x]
+                                \\sha256h.4s %[x], %[y], %[w]
+                                \\sha256h2.4s %[y], v0, %[w]
+                                : [x] "=w" (x),
+                                  [y] "=w" (y),
+                                : [_] "0" (x),
+                                  [_] "1" (y),
+                                  [w] "w" (w),
+                                : "v0"
                             );
                         }
 
-                        const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
-                        asm volatile (
-                            \\mov.4s v0, %[x]
-                            \\sha256h.4s %[x], %[y], %[w]
-                            \\sha256h2.4s %[y], v0, %[w]
-                            : [x] "=w" (x),
-                              [y] "=w" (y),
-                            : [_] "0" (x),
-                              [_] "1" (y),
-                              [w] "w" (w),
-                            : "v0"
-                        );
-                    }
-
-                    d.s[0..4].* = x +% @as(v4u32, d.s[0..4].*);
-                    d.s[4..8].* = y +% @as(v4u32, d.s[4..8].*);
-                    return;
-                },
-                .x86_64 => if (!isComptime() and comptime builtin.cpu.features.isEnabled(@enumToInt(std.Target.x86.Feature.sha))) {
-                    var x: v4u32 = [_]u32{ d.s[5], d.s[4], d.s[1], d.s[0] };
-                    var y: v4u32 = [_]u32{ d.s[7], d.s[6], d.s[3], d.s[2] };
-                    const s_v = @ptrCast(*[16]v4u32, &s);
-
-                    comptime var k: u8 = 0;
-                    inline while (k < 16) : (k += 1) {
-                        if (k < 12) {
-                            var tmp = s_v[k];
-                            s_v[k + 4] = asm (
-                                \\ sha256msg1 %[w4_7], %[tmp]
-                                \\ vpalignr $0x4, %[w8_11], %[w12_15], %[result]
-                                \\ paddd %[tmp], %[result]
-                                \\ sha256msg2 %[w12_15], %[result]
-                                : [tmp] "=&x" (tmp),
-                                  [result] "=&x" (-> v4u32),
-                                : [_] "0" (tmp),
-                                  [w4_7] "x" (s_v[k + 1]),
-                                  [w8_11] "x" (s_v[k + 2]),
-                                  [w12_15] "x" (s_v[k + 3]),
+                        d.s[0..4].* = x +% @as(v4u32, d.s[0..4].*);
+                        d.s[4..8].* = y +% @as(v4u32, d.s[4..8].*);
+                        return;
+                    },
+                    .x86_64 => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sha)) {
+                        var x: v4u32 = [_]u32{ d.s[5], d.s[4], d.s[1], d.s[0] };
+                        var y: v4u32 = [_]u32{ d.s[7], d.s[6], d.s[3], d.s[2] };
+                        const s_v = @ptrCast(*[16]v4u32, &s);
+
+                        comptime var k: u8 = 0;
+                        inline while (k < 16) : (k += 1) {
+                            if (k < 12) {
+                                var tmp = s_v[k];
+                                s_v[k + 4] = asm (
+                                    \\ sha256msg1 %[w4_7], %[tmp]
+                                    \\ vpalignr $0x4, %[w8_11], %[w12_15], %[result]
+                                    \\ paddd %[tmp], %[result]
+                                    \\ sha256msg2 %[w12_15], %[result]
+                                    : [tmp] "=&x" (tmp),
+                                      [result] "=&x" (-> v4u32),
+                                    : [_] "0" (tmp),
+                                      [w4_7] "x" (s_v[k + 1]),
+                                      [w8_11] "x" (s_v[k + 2]),
+                                      [w12_15] "x" (s_v[k + 3]),
+                                );
+                            }
+
+                            const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
+                            y = asm ("sha256rnds2 %[x], %[y]"
+                                : [y] "=x" (-> v4u32),
+                                : [_] "0" (y),
+                                  [x] "x" (x),
+                                  [_] "{xmm0}" (w),
+                            );
+
+                            x = asm ("sha256rnds2 %[y], %[x]"
+                                : [x] "=x" (-> v4u32),
+                                : [_] "0" (x),
+                                  [y] "x" (y),
+                                  [_] "{xmm0}" (@bitCast(v4u32, @bitCast(u128, w) >> 64)),
                             );
                         }
 
-                        const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
-                        y = asm ("sha256rnds2 %[x], %[y]"
-                            : [y] "=x" (-> v4u32),
-                            : [_] "0" (y),
-                              [x] "x" (x),
-                              [_] "{xmm0}" (w),
-                        );
-
-                        x = asm ("sha256rnds2 %[y], %[x]"
-                            : [x] "=x" (-> v4u32),
-                            : [_] "0" (x),
-                              [y] "x" (y),
-                              [_] "{xmm0}" (@bitCast(v4u32, @bitCast(u128, w) >> 64)),
-                        );
-                    }
-
-                    d.s[0] +%= x[3];
-                    d.s[1] +%= x[2];
-                    d.s[4] +%= x[1];
-                    d.s[5] +%= x[0];
-                    d.s[2] +%= y[3];
-                    d.s[3] +%= y[2];
-                    d.s[6] +%= y[1];
-                    d.s[7] +%= y[0];
-                    return;
-                },
-                else => {},
+                        d.s[0] +%= x[3];
+                        d.s[1] +%= x[2];
+                        d.s[4] +%= x[1];
+                        d.s[5] +%= x[0];
+                        d.s[2] +%= y[3];
+                        d.s[3] +%= y[2];
+                        d.s[6] +%= y[1];
+                        d.s[7] +%= y[0];
+                        return;
+                    },
+                    else => {},
+                }
             }
 
             var i: usize = 16;
-- 
cgit v1.2.3
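
For anyone verifying the fast paths locally: the public API is unchanged
by this series, so the intrinsic code paths can be checked against the
standard FIPS 180-4 "abc" vector with a test along these lines (a sketch
written against the std API of this era, not part of the diffs above):

    const std = @import("std");
    const Sha256 = std.crypto.hash.sha2.Sha256;

    test "sha256 intrinsic path matches FIPS 180-4 vector" {
        var out: [Sha256.digest_length]u8 = undefined;
        // Exercises the runtime path: the intrinsics when the target CPU
        // reports the sha/sha2 features, the generic path otherwise.
        Sha256.hash("abc", &out, .{});
        // SHA-256("abc"), from FIPS 180-4.
        const expected = [_]u8{
            0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea,
            0x41, 0x41, 0x40, 0xde, 0x5d, 0xae, 0x22, 0x23,
            0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c,
            0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad,
        };
        try std.testing.expectEqualSlices(u8, &expected, &out);
    }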