11 files changed, 1206 insertions, 202 deletions
diff --git a/std/crypto/blake2.zig b/std/crypto/blake2.zig
index 947133e4cf..467ddde5db 100644
--- a/std/crypto/blake2.zig
+++ b/std/crypto/blake2.zig
@@ -34,8 +34,8 @@ pub const Blake2s256 = Blake2s(256);
 fn Blake2s(comptime out_len: usize) type {
     return struct {
         const Self = this;
-        const block_size = 64;
-        const digest_size = out_len / 8;
+        const block_length = 64;
+        const digest_length = out_len / 8;
 
         const iv = [8]u32{
             0x6A09E667,
@@ -250,8 +250,8 @@ test "blake2s256 streaming" {
 }
 
 test "blake2s256 aligned final" {
-    var block = []u8{0} ** Blake2s256.block_size;
-    var out: [Blake2s256.digest_size]u8 = undefined;
+    var block = []u8{0} ** Blake2s256.block_length;
+    var out: [Blake2s256.digest_length]u8 = undefined;
 
     var h = Blake2s256.init();
     h.update(block);
@@ -267,8 +267,8 @@ pub const Blake2b512 = Blake2b(512);
 fn Blake2b(comptime out_len: usize) type {
     return struct {
         const Self = this;
-        const block_size = 128;
-        const digest_size = out_len / 8;
+        const block_length = 128;
+        const digest_length = out_len / 8;
 
         const iv = [8]u64{
             0x6a09e667f3bcc908,
@@ -483,8 +483,8 @@ test "blake2b512 streaming" {
 }
 
 test "blake2b512 aligned final" {
-    var block = []u8{0} ** Blake2b512.block_size;
-    var out: [Blake2b512.digest_size]u8 = undefined;
+    var block = []u8{0} ** Blake2b512.block_length;
+    var out: [Blake2b512.digest_length]u8 = undefined;
 
     var h = Blake2b512.init();
     h.update(block);
diff --git a/std/crypto/chacha20.zig b/std/crypto/chacha20.zig
index 836c8c8878..743d1033e3 100644
--- a/std/crypto/chacha20.zig
+++ b/std/crypto/chacha20.zig
@@ -32,24 +32,28 @@ fn salsa20_wordtobyte(out: []u8, input: [16]u32) void {
         x[i] = input[i];
 
     const rounds = comptime []QuarterRound{
-        Rp( 0, 4, 8,12),
-        Rp( 1, 5, 9,13),
-        Rp( 2, 6,10,14),
-        Rp( 3, 7,11,15),
-        Rp( 0, 5,10,15),
-        Rp( 1, 6,11,12),
-        Rp( 2, 7, 8,13),
-        Rp( 3, 4, 9,14),
+        Rp(0, 4, 8, 12),
+        Rp(1, 5, 9, 13),
+        Rp(2, 6, 10, 14),
+        Rp(3, 7, 11, 15),
+        Rp(0, 5, 10, 15),
+        Rp(1, 6, 11, 12),
+        Rp(2, 7, 8, 13),
+        Rp(3, 4, 9, 14),
     };
 
     comptime var j: usize = 0;
     inline while (j < 20) : (j += 2) {
         // two-round cycles
         inline for (rounds) |r| {
-            x[r.a] +%= x[r.b]; x[r.d] = std.math.rotl(u32, x[r.d] ^ x[r.a], u32(16));
-            x[r.c] +%= x[r.d]; x[r.b] = std.math.rotl(u32, x[r.b] ^ x[r.c], u32(12));
-            x[r.a] +%= x[r.b]; x[r.d] = std.math.rotl(u32, x[r.d] ^ x[r.a],  u32(8));
-            x[r.c] +%= x[r.d]; x[r.b] = std.math.rotl(u32, x[r.b] ^ x[r.c],  u32(7));
+            x[r.a] +%= x[r.b];
+            x[r.d] = std.math.rotl(u32, x[r.d] ^ x[r.a], u32(16));
+            x[r.c] +%= x[r.d];
+            x[r.b] = std.math.rotl(u32, x[r.b] ^ x[r.c], u32(12));
+            x[r.a] +%= x[r.b];
+            x[r.d] = std.math.rotl(u32, x[r.d] ^ x[r.a], u32(8));
+            x[r.c] +%= x[r.d];
+            x[r.b] = std.math.rotl(u32, x[r.b] ^ x[r.c], u32(7));
         }
     }
 
@@ -166,9 +170,8 @@ pub fn chaCha20With64BitNonce(out: []u8, in: []const u8, counter: u64, key: [32]
             var remaining_blocks: u32 = @intCast(u32, (in.len / big_block));
             var i: u32 = 0;
             while (remaining_blocks > 0) : (remaining_blocks -= 1) {
-                chaCha20_internal(out[cursor..cursor + big_block], in[cursor..cursor + big_block], k, c);
-                c[1] += 1; // upper 32-bit of counter, generic chaCha20_internal() doesn't
-                           // know about this.
+                chaCha20_internal(out[cursor .. cursor + big_block], in[cursor .. cursor + big_block], k, c);
+                c[1] += 1; // upper 32-bit of counter, generic chaCha20_internal() doesn't know about this.
                 cursor += big_block;
             }
         }
@@ -199,16 +202,16 @@ test "crypto.chacha20 test vector sunscreen" {
     const input = "Ladies and Gentlemen of the class of '99: If I could offer you only one tip for the future, sunscreen would be it.";
     var result: [114]u8 = undefined;
     const key = []u8{
-         0, 1, 2, 3, 4, 5, 6, 7,
-         8, 9,10,11,12,13,14,15,
-        16,17,18,19,20,21,22,23,
-        24,25,26,27,28,29,30,31,
+        0, 1, 2, 3, 4, 5, 6, 7,
+        8, 9, 10, 11, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23,
+        24, 25, 26, 27, 28, 29, 30, 31,
     };
     const nonce = []u8{
-         0, 0, 0, 0,
-         0, 0, 0, 0x4a,
-         0, 0, 0, 0,
-     };
+        0, 0, 0, 0,
+        0, 0, 0, 0x4a,
+        0, 0, 0, 0,
+    };
 
     chaCha20IETF(result[0..], input[0..], 1, key, nonce);
     assert(mem.eql(u8, expected_result, result));
@@ -248,7 +251,7 @@ test "crypto.chacha20 test vector 1" {
         0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0,
     };
-    const nonce = []u8{0, 0, 0, 0, 0, 0, 0, 0};
+    const nonce = []u8{ 0, 0, 0, 0, 0, 0, 0, 0 };
 
     chaCha20With64BitNonce(result[0..], input[0..], 0, key, nonce);
     assert(mem.eql(u8, expected_result, result));
@@ -282,7 +285,7 @@ test "crypto.chacha20 test vector 2" {
         0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 1,
     };
-    const nonce = []u8{0, 0, 0, 0, 0, 0, 0, 0};
+    const nonce = []u8{ 0, 0, 0, 0, 0, 0, 0, 0 };
 
     chaCha20With64BitNonce(result[0..], input[0..], 0, key, nonce);
     assert(mem.eql(u8, expected_result, result));
@@ -316,7 +319,7 @@ test "crypto.chacha20 test vector 3" {
         0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0,
     };
-    const nonce = []u8{0, 0, 0, 0, 0, 0, 0, 1};
+    const nonce = []u8{ 0, 0, 0, 0, 0, 0, 0, 1 };
 
     chaCha20With64BitNonce(result[0..], input[0..], 0, key, nonce);
     assert(mem.eql(u8, expected_result, result));
@@ -350,7 +353,7 @@ test "crypto.chacha20 test vector 4" {
         0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0,
     };
-    const nonce = []u8{1, 0, 0, 0, 0, 0, 0, 0};
+    const nonce = []u8{ 1, 0, 0, 0, 0, 0, 0, 0 };
 
     chaCha20With64BitNonce(result[0..], input[0..], 0, key, nonce);
     assert(mem.eql(u8, expected_result, result));
diff --git a/std/crypto/hmac.zig b/std/crypto/hmac.zig
index 1415e88cf4..23eeff2a00 100644
--- a/std/crypto/hmac.zig
+++ b/std/crypto/hmac.zig
@@ -7,46 +7,63 @@ pub const HmacMd5 = Hmac(crypto.Md5);
 pub const HmacSha1 = Hmac(crypto.Sha1);
 pub const HmacSha256 = Hmac(crypto.Sha256);
 
-pub fn Hmac(comptime H: type) type {
+pub fn Hmac(comptime Hash: type) type {
     return struct {
-        const digest_size = H.digest_size;
+        const Self = this;
+        pub const mac_length = Hash.digest_length;
+        pub const minimum_key_length = 0;
 
-        pub fn hash(output: []u8, key: []const u8, message: []const u8) void {
-            debug.assert(output.len >= H.digest_size);
-            debug.assert(H.digest_size <= H.block_size); // HMAC makes this assumption
-            var scratch: [H.block_size]u8 = undefined;
+        o_key_pad: [Hash.block_length]u8,
+        i_key_pad: [Hash.block_length]u8,
+        scratch: [Hash.block_length]u8,
+        hash: Hash,
+
+        // HMAC(k, m) = H(o_key_pad | H(i_key_pad | msg)) where | is concatenation
+        pub fn create(out: []u8, msg: []const u8, key: []const u8) void {
+            var ctx = Self.init(key);
+            ctx.update(msg);
+            ctx.final(out[0..]);
+        }
+
+        pub fn init(key: []const u8) Self {
+            var ctx: Self = undefined;
 
             // Normalize key length to block size of hash
-            if (key.len > H.block_size) {
-                H.hash(key, scratch[0..H.digest_size]);
-                mem.set(u8, scratch[H.digest_size..H.block_size], 0);
-            } else if (key.len < H.block_size) {
-                mem.copy(u8, scratch[0..key.len], key);
-                mem.set(u8, scratch[key.len..H.block_size], 0);
+            if (key.len > Hash.block_length) {
+                Hash.hash(key, ctx.scratch[0..mac_length]);
+                mem.set(u8, ctx.scratch[mac_length..Hash.block_length], 0);
+            } else if (key.len < Hash.block_length) {
+                mem.copy(u8, ctx.scratch[0..key.len], key);
+                mem.set(u8, ctx.scratch[key.len..Hash.block_length], 0);
             } else {
-                mem.copy(u8, scratch[0..], key);
+                mem.copy(u8, ctx.scratch[0..], key);
             }
 
-            var o_key_pad: [H.block_size]u8 = undefined;
-            for (o_key_pad) |*b, i| {
-                b.* = scratch[i] ^ 0x5c;
+            for (ctx.o_key_pad) |*b, i| {
+                b.* = ctx.scratch[i] ^ 0x5c;
             }
 
-            var i_key_pad: [H.block_size]u8 = undefined;
-            for (i_key_pad) |*b, i| {
-                b.* = scratch[i] ^ 0x36;
+            for (ctx.i_key_pad) |*b, i| {
+                b.* = ctx.scratch[i] ^ 0x36;
             }
 
-            // HMAC(k, m) = H(o_key_pad | H(i_key_pad | message)) where | is concatenation
-            var hmac = H.init();
-            hmac.update(i_key_pad[0..]);
-            hmac.update(message);
-            hmac.final(scratch[0..H.digest_size]);
+            ctx.hash = Hash.init();
+            ctx.hash.update(ctx.i_key_pad[0..]);
+            return ctx;
+        }
+
+        pub fn update(ctx: *Self, msg: []const u8) void {
+            ctx.hash.update(msg);
+        }
+
+        pub fn final(ctx: *Self, out: []u8) void {
+            debug.assert(Hash.block_length >= out.len and out.len >= mac_length);
 
-            hmac.reset();
-            hmac.update(o_key_pad[0..]);
-            hmac.update(scratch[0..H.digest_size]);
-            hmac.final(output[0..H.digest_size]);
+            ctx.hash.final(ctx.scratch[0..mac_length]);
+            ctx.hash.reset();
+            ctx.hash.update(ctx.o_key_pad[0..]);
+            ctx.hash.update(ctx.scratch[0..mac_length]);
+            ctx.hash.final(out[0..mac_length]);
         }
     };
 }
@@ -54,28 +71,28 @@ pub fn Hmac(comptime H: type) type {
 const htest = @import("test.zig");
 
 test "hmac md5" {
-    var out: [crypto.Md5.digest_size]u8 = undefined;
-    HmacMd5.hash(out[0..], "", "");
+    var out: [HmacMd5.mac_length]u8 = undefined;
+    HmacMd5.create(out[0..], "", "");
     htest.assertEqual("74e6f7298a9c2d168935f58c001bad88", out[0..]);
 
-    HmacMd5.hash(out[0..], "key", "The quick brown fox jumps over the lazy dog");
+    HmacMd5.create(out[0..], "The quick brown fox jumps over the lazy dog", "key");
     htest.assertEqual("80070713463e7749b90c2dc24911e275", out[0..]);
 }
 
 test "hmac sha1" {
-    var out: [crypto.Sha1.digest_size]u8 = undefined;
-    HmacSha1.hash(out[0..], "", "");
+    var out: [HmacSha1.mac_length]u8 = undefined;
+    HmacSha1.create(out[0..], "", "");
     htest.assertEqual("fbdb1d1b18aa6c08324b7d64b71fb76370690e1d", out[0..]);
 
-    HmacSha1.hash(out[0..], "key", "The quick brown fox jumps over the lazy dog");
+    HmacSha1.create(out[0..], "The quick brown fox jumps over the lazy dog", "key");
     htest.assertEqual("de7c9b85b8b78aa6bc8a7a36f70a90701c9db4d9", out[0..]);
 }
 
 test "hmac sha256" {
-    var out: [crypto.Sha256.digest_size]u8 = undefined;
-    HmacSha256.hash(out[0..], "", "");
+    var out: [HmacSha256.mac_length]u8 = undefined;
+    HmacSha256.create(out[0..], "", "");
     htest.assertEqual("b613679a0814d9ec772f95d778c35fc5ff1697c493715653c6c712144292c5ad", out[0..]);
 
-    HmacSha256.hash(out[0..], "key", "The quick brown fox jumps over the lazy dog");
+    HmacSha256.create(out[0..], "The quick brown fox jumps over the lazy dog", "key");
     htest.assertEqual("f7bc83f430538424b13298e6aa6fb143ef4d59a14946175997479dbc2d1a3cd8", out[0..]);
 }
diff --git a/std/crypto/index.zig b/std/crypto/index.zig
index 1b18a5dc6f..3a88fe2b2c 100644
--- a/std/crypto/index.zig
+++ b/std/crypto/index.zig
@@ -21,19 +21,24 @@ pub const Blake2b512 = blake2.Blake2b512;
 
 const hmac = @import("hmac.zig");
 pub const HmacMd5 = hmac.HmacMd5;
-pub const HmacSha1 = hmac.Sha1;
-pub const HmacSha256 = hmac.Sha256;
+pub const HmacSha1 = hmac.HmacSha1;
+pub const HmacSha256 = hmac.HmacSha256;
 
 const import_chaCha20 = @import("chacha20.zig");
 pub const chaCha20IETF = import_chaCha20.chaCha20IETF;
 pub const chaCha20With64BitNonce = import_chaCha20.chaCha20With64BitNonce;
 
+pub const Poly1305 = @import("poly1305.zig").Poly1305;
+pub const X25519 = @import("x25519.zig").X25519;
+
 test "crypto" {
+    _ = @import("blake2.zig");
+    _ = @import("chacha20.zig");
+    _ = @import("hmac.zig");
     _ = @import("md5.zig");
+    _ = @import("poly1305.zig");
     _ = @import("sha1.zig");
     _ = @import("sha2.zig");
     _ = @import("sha3.zig");
-    _ = @import("blake2.zig");
-    _ = @import("hmac.zig");
-    _ = @import("chacha20.zig");
+    _ = @import("x25519.zig");
 }
diff --git a/std/crypto/md5.zig b/std/crypto/md5.zig
index 23fe2313a0..20334ec7d8 100644
--- a/std/crypto/md5.zig
+++ b/std/crypto/md5.zig
@@ -29,8 +29,8 @@ fn Rp(a: usize, b: usize, c: usize, d: usize, k: usize, s: u32, t: u32) RoundPar
 
 pub const Md5 = struct {
     const Self = this;
-    const block_size = 64;
-    const digest_size = 16;
+    const block_length = 64;
+    const digest_length = 16;
 
     s: [4]u32,
     // Streaming Cache
@@ -271,8 +271,8 @@ test "md5 streaming" {
 }
 
 test "md5 aligned final" {
-    var block = []u8{0} ** Md5.block_size;
-    var out: [Md5.digest_size]u8 = undefined;
+    var block = []u8{0} ** Md5.block_length;
+    var out: [Md5.digest_length]u8 = undefined;
 
     var h = Md5.init();
     h.update(block);
diff --git a/std/crypto/poly1305.zig b/std/crypto/poly1305.zig
new file mode 100644
index 0000000000..f5e11fc0a1
--- /dev/null
+++ b/std/crypto/poly1305.zig
@@ -0,0 +1,233 @@
+// Translated from monocypher which is licensed under CC-0/BSD-3.
+//
+// https://monocypher.org/
+
+const std = @import("../index.zig");
+const builtin = @import("builtin");
+
+const Endian = builtin.Endian;
+const readInt = std.mem.readInt;
+const writeInt = std.mem.writeInt;
+
+pub const Poly1305 = struct {
+    const Self = this;
+
+    pub const mac_length = 16;
+    pub const minimum_key_length = 32;
+
+    // constant multiplier (from the secret key)
+    r: [4]u32,
+    // accumulated hash
+    h: [5]u32,
+    // chunk of the message
+    c: [5]u32,
+    // random number added at the end (from the secret key)
+    pad: [4]u32,
+    // Number of message bytes currently buffered in the chunk.
+    c_idx: usize,
+
+    fn secureZero(self: *Self) void {
+        std.mem.secureZero(u8, @ptrCast([*]u8, self)[0..@sizeOf(Poly1305)]);
+    }
+
+    pub fn create(out: []u8, msg: []const u8, key: []const u8) void {
+        std.debug.assert(out.len >= mac_length);
+        std.debug.assert(key.len >= minimum_key_length);
+
+        var ctx = Poly1305.init(key);
+        ctx.update(msg);
+        ctx.final(out);
+    }
+
+    // Initialize the MAC context.
+    //   - key.len must be at least minimum_key_length (32) bytes.
+    pub fn init(key: []const u8) Self {
+        var ctx: Poly1305 = undefined;
+
+        // Initial hash is zero
+        {
+            var i: usize = 0;
+            while (i < 5) : (i += 1) {
+                ctx.h[i] = 0;
+            }
+        }
+        // add 2^128 to every full input block (c[4] is the fifth 32-bit word)
+        ctx.c[4] = 1;
+        polyClearC(&ctx);
+
+        // load r and pad (r has some of its bits cleared)
+        {
+            var i: usize = 0;
+            while (i < 1) : (i += 1) {
+                ctx.r[0] = readInt(key[0..4], u32, Endian.Little) & 0x0fffffff;
+            }
+        }
+        {
+            var i: usize = 1;
+            while (i < 4) : (i += 1) {
+                ctx.r[i] = readInt(key[i * 4 .. i * 4 + 4], u32, Endian.Little) & 0x0ffffffc;
+            }
+        }
+        {
+            var i: usize = 0;
+            while (i < 4) : (i += 1) {
+                ctx.pad[i] = readInt(key[i * 4 + 16 .. i * 4 + 16 + 4], u32, Endian.Little);
+            }
+        }
+
+        return ctx;
+    }
+
+    // h = (h + c) * r
+    // Preconditions:
+    //   ctx.h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
+    //   ctx.c <= 1_ffffffff_ffffffff_ffffffff_ffffffff
+    //   ctx.r <=   0ffffffc_0ffffffc_0ffffffc_0fffffff
+    // Postcondition:
+    //   ctx.h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
+    fn polyBlock(ctx: *Self) void {
+        // s = h + c, without carry propagation
+        const s0 = u64(ctx.h[0]) + ctx.c[0]; // s0 <= 1_fffffffe
+        const s1 = u64(ctx.h[1]) + ctx.c[1]; // s1 <= 1_fffffffe
+        const s2 = u64(ctx.h[2]) + ctx.c[2]; // s2 <= 1_fffffffe
+        const s3 = u64(ctx.h[3]) + ctx.c[3]; // s3 <= 1_fffffffe
+        const s4 = u64(ctx.h[4]) + ctx.c[4]; // s4 <=          5
+
+        // Local all the things!
+        const r0 = ctx.r[0]; // r0  <= 0fffffff
+        const r1 = ctx.r[1]; // r1  <= 0ffffffc
+        const r2 = ctx.r[2]; // r2  <= 0ffffffc
+        const r3 = ctx.r[3]; // r3  <= 0ffffffc
+        const rr0 = (r0 >> 2) * 5; // rr0 <= 13fffffb // lose 2 bits...
+        const rr1 = (r1 >> 2) + r1; // rr1 <= 13fffffb // rr1 == (r1 >> 2) * 5
+        const rr2 = (r2 >> 2) + r2; // rr2 <= 13fffffb // rr2 == (r2 >> 2) * 5
+        const rr3 = (r3 >> 2) + r3; // rr3 <= 13fffffb // rr3 == (r3 >> 2) * 5
+
+        // (h + c) * r, without carry propagation
+        const x0 = s0 * r0 + s1 * rr3 + s2 * rr2 + s3 * rr1 + s4 * rr0; //<=97ffffe007fffff8
+        const x1 = s0 * r1 + s1 * r0 + s2 * rr3 + s3 * rr2 + s4 * rr1; //<=8fffffe20ffffff6
+        const x2 = s0 * r2 + s1 * r1 + s2 * r0 + s3 * rr3 + s4 * rr2; //<=87ffffe417fffff4
+        const x3 = s0 * r3 + s1 * r2 + s2 * r1 + s3 * r0 + s4 * rr3; //<=7fffffe61ffffff2
+        const x4 = s4 * (r0 & 3); // ...recover 2 bits      //<=               f
+
+        // partial reduction modulo 2^130 - 5
+        const _u5 = @truncate(u32, x4 + (x3 >> 32)); // u5 <= 7ffffff5
+        const _u0 = (_u5 >> 2) * 5 + (x0 & 0xffffffff);
+        const _u1 = (_u0 >> 32) + (x1 & 0xffffffff) + (x0 >> 32);
+        const _u2 = (_u1 >> 32) + (x2 & 0xffffffff) + (x1 >> 32);
+        const _u3 = (_u2 >> 32) + (x3 & 0xffffffff) + (x2 >> 32);
+        const _u4 = (_u3 >> 32) + (_u5 & 3);
+
+        // Update the hash
+        ctx.h[0] = @truncate(u32, _u0); // u0 <= 1_9ffffff0
+        ctx.h[1] = @truncate(u32, _u1); // u1 <= 1_97ffffe0
+        ctx.h[2] = @truncate(u32, _u2); // u2 <= 1_8fffffe2
+        ctx.h[3] = @truncate(u32, _u3); // u3 <= 1_87ffffe4
+        ctx.h[4] = @truncate(u32, _u4); // u4 <=          4
+    }
+
+    // (re-)initializes the input counter and input buffer
+    fn polyClearC(ctx: *Self) void {
+        ctx.c[0] = 0;
+        ctx.c[1] = 0;
+        ctx.c[2] = 0;
+        ctx.c[3] = 0;
+        ctx.c_idx = 0;
+    }
+
+    fn polyTakeInput(ctx: *Self, input: u8) void {
+        const word = ctx.c_idx >> 2;
+        const byte = ctx.c_idx & 3;
+        ctx.c[word] |= std.math.shl(u32, input, byte * 8);
+        ctx.c_idx += 1;
+    }
+
+    fn polyUpdate(ctx: *Self, msg: []const u8) void {
+        for (msg) |b| {
+            polyTakeInput(ctx, b);
+            if (ctx.c_idx == 16) {
+                polyBlock(ctx);
+                polyClearC(ctx);
+            }
+        }
+    }
+
+    fn alignTo(x: usize, block_size: usize) usize {
+        return ((~x) +% 1) & (block_size - 1);
+    }
+
+    // Feed data into the MAC context.
+    pub fn update(ctx: *Self, msg: []const u8) void {
+        // Align ourselves with block boundaries
+        const alignm = std.math.min(alignTo(ctx.c_idx, 16), msg.len);
+        polyUpdate(ctx, msg[0..alignm]);
+
+        var nmsg = msg[alignm..];
+
+        // Process the msg block by block
+        const nb_blocks = nmsg.len >> 4;
+        var i: usize = 0;
+        while (i < nb_blocks) : (i += 1) {
+            ctx.c[0] = readInt(nmsg[0..4], u32, Endian.Little);
+            ctx.c[1] = readInt(nmsg[4..8], u32, Endian.Little);
+            ctx.c[2] = readInt(nmsg[8..12], u32, Endian.Little);
+            ctx.c[3] = readInt(nmsg[12..16], u32, Endian.Little);
+            polyBlock(ctx);
+            nmsg = nmsg[16..];
+        }
+        if (nb_blocks > 0) {
+            polyClearC(ctx);
+        }
+
+        // remaining bytes
+        polyUpdate(ctx, nmsg[0..]);
+    }
+
+    // Finalize the MAC and output into buffer provided by caller.
+    pub fn final(ctx: *Self, out: []u8) void {
+        // Process the last block (if any)
+        if (ctx.c_idx != 0) {
+            // move the final 1 according to remaining input length
+            // (We add less than 2^128 to a partial last input block)
+            ctx.c[4] = 0;
+            polyTakeInput(ctx, 1);
+            // one last hash update
+            polyBlock(ctx);
+        }
+
+        // check if we should subtract 2^130-5 by performing the
+        // corresponding carry propagation.
+        const _u0 = u64(5) + ctx.h[0]; // <= 1_00000004
+        const _u1 = (_u0 >> 32) + ctx.h[1]; // <= 1_00000000
+        const _u2 = (_u1 >> 32) + ctx.h[2]; // <= 1_00000000
+        const _u3 = (_u2 >> 32) + ctx.h[3]; // <= 1_00000000
+        const _u4 = (_u3 >> 32) + ctx.h[4]; // <=          5
+        // u4 indicates how many times we should subtract 2^130-5 (0 or 1)
+
+        // h + pad, minus 2^130-5 if u4 exceeds 3
+        const uu0 = (_u4 >> 2) * 5 + ctx.h[0] + ctx.pad[0]; // <= 2_00000003
+        const uu1 = (uu0 >> 32) + ctx.h[1] + ctx.pad[1]; // <= 2_00000000
+        const uu2 = (uu1 >> 32) + ctx.h[2] + ctx.pad[2]; // <= 2_00000000
+        const uu3 = (uu2 >> 32) + ctx.h[3] + ctx.pad[3]; // <= 2_00000000
+
+        writeInt(out[0..], @truncate(u32, uu0), Endian.Little);
+        writeInt(out[4..], @truncate(u32, uu1), Endian.Little);
+        writeInt(out[8..], @truncate(u32, uu2), Endian.Little);
+        writeInt(out[12..], @truncate(u32, uu3), Endian.Little);
+
+        ctx.secureZero();
+    }
+};
+
+test "poly1305 rfc7439 vector1" {
+    const expected_mac = "\xa8\x06\x1d\xc1\x30\x51\x36\xc6\xc2\x2b\x8b\xaf\x0c\x01\x27\xa9";
+
+    const msg = "Cryptographic Forum Research Group";
+    const key = "\x85\xd6\xbe\x78\x57\x55\x6d\x33\x7f\x44\x52\xfe\x42\xd5\x06\xa8" ++
+        "\x01\x03\x80\x8a\xfb\x0d\xb2\xfd\x4a\xbf\xf6\xaf\x41\x49\xf5\x1b";
+
+    var mac: [16]u8 = undefined;
+    Poly1305.create(mac[0..], msg, key);
+
+    std.debug.assert(std.mem.eql(u8, mac, expected_mac));
+}
diff --git a/std/crypto/sha1.zig b/std/crypto/sha1.zig
index 451cfb3122..6d6b4dbd3f 100644
--- a/std/crypto/sha1.zig
+++ b/std/crypto/sha1.zig
@@ -26,8 +26,8 @@ fn Rp(a: usize, b: usize, c: usize, d: usize, e: usize, i: u32) RoundParam {
 
 pub const Sha1 = struct {
     const Self = this;
-    const block_size = 64;
-    const digest_size = 20;
+    const block_length = 64;
+    const digest_length = 20;
 
     s: [5]u32,
     // Streaming Cache
@@ -292,8 +292,8 @@ test "sha1 streaming" {
 }
 
 test "sha1 aligned final" {
-    var block = []u8{0} ** Sha1.block_size;
-    var out: [Sha1.digest_size]u8 = undefined;
+    var block = []u8{0} ** Sha1.block_length;
+    var out: [Sha1.digest_length]u8 = undefined;
 
     var h = Sha1.init();
     h.update(block);
diff --git a/std/crypto/sha2.zig b/std/crypto/sha2.zig
index d1b915835c..8a25fecc43 100644
--- a/std/crypto/sha2.zig
+++ b/std/crypto/sha2.zig
@@ -78,8 +78,8 @@ pub const Sha256 = Sha2_32(Sha256Params);
 fn Sha2_32(comptime params: Sha2Params32) type {
     return struct {
         const Self = this;
-        const block_size = 64;
-        const digest_size = params.out_len / 8;
+        const block_length = 64;
+        const digest_length = params.out_len / 8;
 
         s: [8]u32,
         // Streaming Cache
@@ -338,8 +338,8 @@ test "sha256 streaming" {
 }
 
 test "sha256 aligned final" {
-    var block = []u8{0} ** Sha256.block_size;
-    var out: [Sha256.digest_size]u8 = undefined;
+    var block = []u8{0} ** Sha256.block_length;
+    var out: [Sha256.digest_length]u8 = undefined;
 
     var h = Sha256.init();
     h.update(block);
@@ -419,8 +419,8 @@ pub const Sha512 = Sha2_64(Sha512Params);
 fn Sha2_64(comptime params: Sha2Params64) type {
     return struct {
         const Self = this;
-        const block_size = 128;
-        const digest_size = params.out_len / 8;
+        const block_length = 128;
+        const digest_length = params.out_len / 8;
 
         s: [8]u64,
         // Streaming Cache
@@ -715,8 +715,8 @@ test "sha512 streaming" {
 }
 
 test "sha512 aligned final" {
-    var block = []u8{0} ** Sha512.block_size;
-    var out: [Sha512.digest_size]u8 = undefined;
+    var block = []u8{0} ** Sha512.block_length;
+    var out: [Sha512.digest_length]u8 = undefined;
 
     var h = Sha512.init();
     h.update(block);
diff --git a/std/crypto/sha3.zig b/std/crypto/sha3.zig
index ae02d7a482..827bbd0680 100644
--- a/std/crypto/sha3.zig
+++ b/std/crypto/sha3.zig
@@ -13,8 +13,8 @@ pub const Sha3_512 = Keccak(512, 0x06);
 fn Keccak(comptime bits: usize, comptime delim: u8) type {
     return struct {
         const Self = this;
-        const block_size = 200;
-        const digest_size = bits / 8;
+        const block_length = 200;
+        const digest_length = bits / 8;
 
         s: [200]u8,
         offset: usize,
@@ -87,97 +87,24 @@ fn Keccak(comptime bits: usize, comptime delim: u8) type {
 }
 
 const RC = []const u64{
-    0x0000000000000001,
-    0x0000000000008082,
-    0x800000000000808a,
-    0x8000000080008000,
-    0x000000000000808b,
-    0x0000000080000001,
-    0x8000000080008081,
-    0x8000000000008009,
-    0x000000000000008a,
-    0x0000000000000088,
-    0x0000000080008009,
-    0x000000008000000a,
-    0x000000008000808b,
-    0x800000000000008b,
-    0x8000000000008089,
-    0x8000000000008003,
-    0x8000000000008002,
-    0x8000000000000080,
-    0x000000000000800a,
-    0x800000008000000a,
-    0x8000000080008081,
-    0x8000000000008080,
-    0x0000000080000001,
-    0x8000000080008008,
+    0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
+    0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
+    0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+    0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
+    0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
+    0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
 };
 
 const ROTC = []const usize{
-    1,
-    3,
-    6,
-    10,
-    15,
-    21,
-    28,
-    36,
-    45,
-    55,
-    2,
-    14,
-    27,
-    41,
-    56,
-    8,
-    25,
-    43,
-    62,
-    18,
-    39,
-    61,
-    20,
-    44,
+    1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
 };
 
 const PIL = []const usize{
-    10,
-    7,
-    11,
-    17,
-    18,
-    3,
-    5,
-    16,
-    8,
-    21,
-    24,
-    4,
-    15,
-    23,
-    19,
-    13,
-    12,
-    2,
-    20,
-    14,
-    22,
-    9,
-    6,
-    1,
+    10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
 };
 
 const M5 = []const usize{
-    0,
-    1,
-    2,
-    3,
-    4,
-    0,
-    1,
-    2,
-    3,
-    4,
+    0, 1, 2, 3, 4, 0, 1, 2, 3, 4,
 };
 
 fn keccak_f(comptime F: usize, d: []u8) void {
@@ -297,8 +224,8 @@ test "sha3-256 streaming" {
 }
 
 test "sha3-256 aligned final" {
-    var block = []u8{0} ** Sha3_256.block_size;
-    var out: [Sha3_256.digest_size]u8 = undefined;
+    var block = []u8{0} ** Sha3_256.block_length;
+    var out: [Sha3_256.digest_length]u8 = undefined;
 
     var h = Sha3_256.init();
     h.update(block);
@@ -368,8 +295,8 @@ test "sha3-512 streaming" {
 }
 
 test "sha3-512 aligned final" {
-    var block = []u8{0} ** Sha3_512.block_size;
-    var out: [Sha3_512.digest_size]u8 = undefined;
+    var block = []u8{0} ** Sha3_512.block_length;
+    var out: [Sha3_512.digest_length]u8 = undefined;
 
     var h = Sha3_512.init();
     h.update(block);
diff --git a/std/crypto/throughput_test.zig b/std/crypto/throughput_test.zig
index c21838e607..294ef5df51 100644
--- a/std/crypto/throughput_test.zig
+++ b/std/crypto/throughput_test.zig
@@ -1,38 +1,193 @@
-// Modify the HashFunction variable to the one wanted to test.
-//
-// ```
-// zig build-exe --release-fast throughput_test.zig
-// ./throughput_test
-// ```
-
+const builtin = @import("builtin");
 const std = @import("std");
 const time = std.os.time;
 const Timer = time.Timer;
-const HashFunction = @import("md5.zig").Md5;
+const crypto = @import("index.zig");
 
-const MiB = 1024 * 1024;
-const BytesToHash = 1024 * MiB;
+const KiB = 1024;
+const MiB = 1024 * KiB;
 
-pub fn main() !void {
-    var stdout_file = try std.io.getStdOut();
-    var stdout_out_stream = std.io.FileOutStream.init(&stdout_file);
-    const stdout = &stdout_out_stream.stream;
+var prng = std.rand.DefaultPrng.init(0);
 
-    var block: [HashFunction.block_size]u8 = undefined;
-    std.mem.set(u8, block[0..], 0);
+const Crypto = struct {
+    ty: type,
+    name: []const u8,
+};
 
-    var h = HashFunction.init();
-    var offset: usize = 0;
+const hashes = []Crypto{
+    Crypto{ .ty = crypto.Md5, .name = "md5" },
+    Crypto{ .ty = crypto.Sha1, .name = "sha1" },
+    Crypto{ .ty = crypto.Sha256, .name = "sha256" },
+    Crypto{ .ty = crypto.Sha512, .name = "sha512" },
+    Crypto{ .ty = crypto.Sha3_256, .name = "sha3-256" },
+    Crypto{ .ty = crypto.Sha3_512, .name = "sha3-512" },
+    Crypto{ .ty = crypto.Blake2s256, .name = "blake2s" },
+    Crypto{ .ty = crypto.Blake2b512, .name = "blake2b" },
+};
+
+pub fn benchmarkHash(comptime Hash: var, comptime bytes: comptime_int) !u64 {
+    var h = Hash.init();
+
+    var block: [Hash.digest_length]u8 = undefined;
+    prng.random.bytes(block[0..]);
 
+    var offset: usize = 0;
     var timer = try Timer.start();
     const start = timer.lap();
-    while (offset < BytesToHash) : (offset += block.len) {
+    while (offset < bytes) : (offset += block.len) {
         h.update(block[0..]);
     }
     const end = timer.read();
 
     const elapsed_s = @intToFloat(f64, end - start) / time.ns_per_s;
-    const throughput = @floatToInt(u64, BytesToHash / elapsed_s);
+    const throughput = @floatToInt(u64, bytes / elapsed_s);
+
+    return throughput;
+}
+
+const macs = []Crypto{
+    Crypto{ .ty = crypto.Poly1305, .name = "poly1305" },
+    Crypto{ .ty = crypto.HmacMd5, .name = "hmac-md5" },
+    Crypto{ .ty = crypto.HmacSha1, .name = "hmac-sha1" },
+    Crypto{ .ty = crypto.HmacSha256, .name = "hmac-sha256" },
+};
+
+// Measures MAC throughput in bytes per second: repeatedly authenticates a random
+// 1 MiB message until at least `bytes` bytes have been processed.
+pub fn benchmarkMac(comptime Mac: var, comptime bytes: comptime_int) !u64 {
+    std.debug.assert(Mac.mac_length <= 32 and Mac.minimum_key_length <= 32);
+    var message: [1 * MiB]u8 = undefined;
+    prng.random.bytes(message[0..]);
+    var key: [32]u8 = undefined;
+    prng.random.bytes(key[0..]);
+
+    var timer = try Timer.start();
+    const start = timer.lap();
+    var processed: usize = 0;
+    while (processed < bytes) {
+        // NOTE(review): `key` is both the first and the last argument; confirm against Mac.create.
+        Mac.create(key[0..], message[0..], key);
+        processed += message.len;
+    }
+    const end = timer.read();
+
+    const seconds = @intToFloat(f64, end - start) / time.ns_per_s;
+    return @floatToInt(u64, bytes / seconds);
+}
+
+const exchanges = []Crypto{Crypto{ .ty = crypto.X25519, .name = "x25519" }};
+
+// Runs `exchange_count` key exchanges on random inputs and returns the rate in exchanges per second.
+pub fn benchmarkKeyExchange(comptime DhKeyExchange: var, comptime exchange_count: comptime_int) !u64 {
+    std.debug.assert(DhKeyExchange.minimum_key_length >= DhKeyExchange.secret_length);
+
+    var in: [DhKeyExchange.minimum_key_length]u8 = undefined;
+    prng.random.bytes(in[0..]);
+
+    var out: [DhKeyExchange.minimum_key_length]u8 = undefined;
+    prng.random.bytes(out[0..]);
+
+    var timer = try Timer.start();
+    const start = timer.lap();
+    {
+        var i: usize = 0;
+        while (i < exchange_count) : (i += 1) {
+            _ = DhKeyExchange.create(out[0..], out, in);
+        }
+    }
+    const end = timer.read();
+
+    const elapsed_s = @intToFloat(f64, end - start) / time.ns_per_s;
+    const throughput = @floatToInt(u64, exchange_count / elapsed_s);
+
+    return throughput;
+}
 
-    try stdout.print("{}: {} MiB/s\n", @typeName(HashFunction), throughput / (1 * MiB));
+fn usage() void {
+    std.debug.warn(
+        \\throughput_test [options]
+        \\
+        \\Options:
+        \\  --filter [test-name]
+        \\  --seed   [int]
+        \\  --help
+        \\
+    );
+}
+
+fn mode(comptime x: comptime_int) comptime_int {
+    return if (builtin.mode != builtin.Mode.Debug) x else x / 64; // Debug builds get 1/64 of the workload
+}
+
+// Right-aligns `s` in a 12-column field; names of 12 or more bytes are printed unpadded
+// (comparing `i + s.len < 12` avoids the `usize` underflow that `12 - s.len` would hit).
+// TODO(#1358): Replace with builtin formatted padding when available.
+fn printPad(stdout: var, s: []const u8) !void {
+    var i: usize = 0;
+    while (i + s.len < 12) : (i += 1) try stdout.print(" ");
+    try stdout.print("{}", s);
+}
+
+pub fn main() !void {
+    var stdout_file = try std.io.getStdOut();
+    var stdout_out_stream = std.io.FileOutStream.init(&stdout_file);
+    const stdout = &stdout_out_stream.stream;
+
+    var buffer: [1024]u8 = undefined;
+    var fixed = std.heap.FixedBufferAllocator.init(buffer[0..]);
+    const args = try std.os.argsAlloc(&fixed.allocator);
+
+    var filter: ?[]u8 = "";
+
+    var i: usize = 1;
+    while (i < args.len) : (i += 1) {
+        if (std.mem.eql(u8, args[i], "--seed")) {
+            i += 1;
+            if (i == args.len) {
+                usage();
+                std.os.exit(1);
+            }
+
+            const seed = try std.fmt.parseUnsigned(u32, args[i], 10);
+            prng.seed(seed);
+        } else if (std.mem.eql(u8, args[i], "--filter")) {
+            i += 1;
+            if (i == args.len) {
+                usage();
+                std.os.exit(1);
+            }
+
+            filter = args[i];
+        } else if (std.mem.eql(u8, args[i], "--help")) {
+            usage();
+            return;
+        } else {
+            usage();
+            std.os.exit(1);
+        }
+    }
+
+    inline for (hashes) |H| {
+        if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) {
+            const throughput = try benchmarkHash(H.ty, mode(32 * MiB));
+            try printPad(stdout, H.name);
+            try stdout.print(": {} MiB/s\n", throughput / (1 * MiB));
+        }
+    }
+
+    inline for (macs) |M| {
+        if (filter == null or std.mem.indexOf(u8, M.name, filter.?) != null) {
+            const throughput = try benchmarkMac(M.ty, mode(128 * MiB));
+            try printPad(stdout, M.name);
+            try stdout.print(": {} MiB/s\n", throughput / (1 * MiB));
+        }
+    }
+
+    inline for (exchanges) |E| {
+        if (filter == null or std.mem.indexOf(u8, E.name, filter.?) != null) {
+            const throughput = try benchmarkKeyExchange(E.ty, mode(1000));
+            try printPad(stdout, E.name);
+            try stdout.print(": {} exchanges/s\n", throughput);
+        }
+    }
 }
diff --git a/std/crypto/x25519.zig b/std/crypto/x25519.zig
new file mode 100644
index 0000000000..16ec09f66e
--- /dev/null
+++ b/std/crypto/x25519.zig
@@ -0,0 +1,664 @@
+// Translated from monocypher which is licensed under CC-0/BSD-3.
+//
+// https://monocypher.org/
+
+const std = @import("../index.zig");
+const builtin = @import("builtin");
+
+const Endian = builtin.Endian;
+const readInt = std.mem.readInt;
+const writeInt = std.mem.writeInt;
+
+// Based on Supercop's ref10 implementation.
+pub const X25519 = struct {
+    pub const secret_length = 32;
+    pub const minimum_key_length = 32;
+
+    fn trimScalar(s: []u8) void {
+        s[0] &= 248;
+        s[31] &= 127;
+        s[31] |= 64;
+    }
+
+    fn scalarBit(s: []const u8, i: usize) i32 {
+        return (s[i >> 3] >> @intCast(u3, i & 7)) & 1;
+    }
+
+    pub fn create(out: []u8, private_key: []const u8, public_key: []const u8) bool {
+        std.debug.assert(out.len >= secret_length);
+        std.debug.assert(private_key.len >= minimum_key_length);
+        std.debug.assert(public_key.len >= minimum_key_length);
+
+        var storage: [7]Fe = undefined;
+        var x1 = &storage[0];
+        var x2 = &storage[1];
+        var z2 = &storage[2];
+        var x3 = &storage[3];
+        var z3 = &storage[4];
+        var t0 = &storage[5];
+        var t1 = &storage[6];
+
+        // computes the scalar product
+        Fe.fromBytes(x1, public_key);
+
+        // restrict the possible scalar values
+        var e: [32]u8 = undefined;
+        for (e[0..]) |_, i| {
+            e[i] = private_key[i];
+        }
+        trimScalar(e[0..]);
+
+        // computes the actual scalar product (the result is in x2 and z2)
+
+        // Montgomery ladder
+        // In projective coordinates, to avoid divisions: x = X / Z
+        // We don't care about the y coordinate, it's only 1 bit of information
+        Fe.init1(x2);
+        Fe.init0(z2); // "zero" point
+        Fe.copy(x3, x1);
+        Fe.init1(z3);
+
+        var swap: i32 = 0;
+        var pos: isize = 254;
+        while (pos >= 0) : (pos -= 1) {
+            // constant time conditional swap before ladder step
+            const b = scalarBit(e, @intCast(usize, pos));
+            swap ^= b; // xor trick avoids swapping at the end of the loop
+            Fe.cswap(x2, x3, swap);
+            Fe.cswap(z2, z3, swap);
+            swap = b; // anticipates one last swap after the loop
+
+            // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
+            // with differential addition
+            Fe.sub(t0, x3, z3);
+            Fe.sub(t1, x2, z2);
+            Fe.add(x2, x2, z2);
+            Fe.add(z2, x3, z3);
+            Fe.mul(z3, t0, x2);
+            Fe.mul(z2, z2, t1);
+            Fe.sq(t0, t1);
+            Fe.sq(t1, x2);
+            Fe.add(x3, z3, z2);
+            Fe.sub(z2, z3, z2);
+            Fe.mul(x2, t1, t0);
+            Fe.sub(t1, t1, t0);
+            Fe.sq(z2, z2);
+            Fe.mulSmall(z3, t1, 121666);
+            Fe.sq(x3, x3);
+            Fe.add(t0, t0, z3);
+            Fe.mul(z3, x1, z2);
+            Fe.mul(z2, t1, t0);
+        }
+
+        // last swap is necessary to compensate for the xor trick
+        // Note: after this swap, P3 == P2 + P1.
+        Fe.cswap(x2, x3, swap);
+        Fe.cswap(z2, z3, swap);
+
+        // normalises the coordinates: x == X / Z
+        Fe.invert(z2, z2);
+        Fe.mul(x2, x2, z2);
+        Fe.toBytes(out, x2);
+
+        x1.secureZero();
+        x2.secureZero();
+        x3.secureZero();
+        t0.secureZero();
+        t1.secureZero();
+        z2.secureZero();
+        z3.secureZero();
+        std.mem.secureZero(u8, e[0..]);
+
+        // Returns false if the output is all zero
+        // (happens with some malicious public keys)
+        return !zerocmp(u8, out);
+    }
+
+    pub fn createPublicKey(public_key: []u8, private_key: []const u8) bool { // writes into `public_key`
+        const base_point = []u8{9} ++ []u8{0} ** 31; // byte 9 followed by 31 zero bytes
+        return create(public_key, private_key, base_point); // `public_key` is `create`'s mutable `out`
+    }
+};
+
+// Constant time check that every element of `a` is zero (true for an empty slice).
+fn zerocmp(comptime T: type, a: []const T) bool {
+    var acc: T = 0;
+    for (a) |_, idx| {
+        acc |= a[idx];
+    }
+    return acc == 0;
+}
+
+////////////////////////////////////
+/// Arithmetic modulo 2^255 - 19 ///
+////////////////////////////////////
+//  Taken from Supercop's ref10 implementation.
+//  A bit bigger than TweetNaCl, over 4 times faster.
+
+// field element
+const Fe = struct {
+    b: [10]i32,
+
+    fn secureZero(self: *Fe) void {
+        std.mem.secureZero(u8, @ptrCast([*]u8, self)[0..@sizeOf(Fe)]);
+    }
+
+    // Sets every limb of `h` to zero.
+    fn init0(h: *Fe) void {
+        var i: usize = 0;
+        while (i < h.b.len) : (i += 1) h.b[i] = 0;
+    }
+
+    // Sets `h` to one: limb 0 becomes 1 and every higher limb becomes 0.
+    fn init1(h: *Fe) void {
+        h.b[0] = 1;
+        var i: usize = 1;
+        while (i < h.b.len) : (i += 1) h.b[i] = 0;
+    }
+
+    // Copies every limb of `f` into `h`.
+    fn copy(h: *Fe, f: *const Fe) void {
+        var i: usize = 0;
+        while (i < h.b.len) : (i += 1) h.b[i] = f.b[i];
+    }
+
+    // Stores the limb-wise negation of `f` in `h`.
+    fn neg(h: *Fe, f: *const Fe) void {
+        var i: usize = 0;
+        while (i < h.b.len) : (i += 1) h.b[i] = -f.b[i];
+    }
+
+    // Stores the limb-wise sum `f + g` in `h`; no carry propagation happens here.
+    fn add(h: *Fe, f: *const Fe, g: *const Fe) void {
+        var i: usize = 0;
+        while (i < h.b.len) : (i += 1) h.b[i] = f.b[i] + g.b[i];
+    }
+
+    // Stores the limb-wise difference `f - g` in `h`; no carry propagation happens here.
+    fn sub(h: *Fe, f: *const Fe, g: *const Fe) void {
+        var i: usize = 0;
+        while (i < h.b.len) : (i += 1) h.b[i] = f.b[i] - g.b[i];
+    }
+
+    fn cswap(f: *Fe, g: *Fe, b: i32) void {
+        for (f.b) |_, i| {
+            const x = (f.b[i] ^ g.b[i]) & -b;
+            f.b[i] ^= x;
+            g.b[i] ^= x;
+        }
+    }
+
+    fn ccopy(f: *Fe, g: *const Fe, b: i32) void {
+        for (f.b) |_, i| {
+            const x = (f.b[i] ^ g.b[i]) & -b;
+            f.b[i] ^= x;
+        }
+    }
+
+    inline fn carryRound(c: []i64, t: []i64, comptime i: comptime_int, comptime shift: comptime_int, comptime mult: comptime_int) void {
+        const j = (i + 1) % 10;
+
+        c[i] = (t[i] + (i64(1) << shift)) >> (shift + 1);
+        t[j] += c[i] * mult;
+        t[i] -= c[i] * (i64(1) << (shift + 1));
+    }
+
+    fn carry1(h: *Fe, t: []i64) void {
+        var c: [10]i64 = undefined;
+
+        var sc = c[0..];
+        var st = t[0..];
+
+        carryRound(sc, st, 9, 24, 19);
+        carryRound(sc, st, 1, 24, 1);
+        carryRound(sc, st, 3, 24, 1);
+        carryRound(sc, st, 5, 24, 1);
+        carryRound(sc, st, 7, 24, 1);
+        carryRound(sc, st, 0, 25, 1);
+        carryRound(sc, st, 2, 25, 1);
+        carryRound(sc, st, 4, 25, 1);
+        carryRound(sc, st, 6, 25, 1);
+        carryRound(sc, st, 8, 25, 1);
+
+        for (h.b) |_, i| {
+            h.b[i] = @intCast(i32, t[i]);
+        }
+    }
+
+    fn carry2(h: *Fe, t: []i64) void {
+        var c: [10]i64 = undefined;
+
+        var sc = c[0..];
+        var st = t[0..];
+
+        carryRound(sc, st, 0, 25, 1);
+        carryRound(sc, st, 4, 25, 1);
+        carryRound(sc, st, 1, 24, 1);
+        carryRound(sc, st, 5, 24, 1);
+        carryRound(sc, st, 2, 25, 1);
+        carryRound(sc, st, 6, 25, 1);
+        carryRound(sc, st, 3, 24, 1);
+        carryRound(sc, st, 7, 24, 1);
+        carryRound(sc, st, 4, 25, 1);
+        carryRound(sc, st, 8, 25, 1);
+        carryRound(sc, st, 9, 24, 19);
+        carryRound(sc, st, 0, 25, 1);
+
+        for (h.b) |_, i| {
+            h.b[i] = @intCast(i32, t[i]);
+        }
+    }
+
+    fn fromBytes(h: *Fe, s: []const u8) void {
+        std.debug.assert(s.len >= 32);
+
+        var t: [10]i64 = undefined;
+
+        t[0] = readInt(s[0..4], u32, Endian.Little);
+        t[1] = readInt(s[4..7], u32, Endian.Little) << 6;
+        t[2] = readInt(s[7..10], u32, Endian.Little) << 5;
+        t[3] = readInt(s[10..13], u32, Endian.Little) << 3;
+        t[4] = readInt(s[13..16], u32, Endian.Little) << 2;
+        t[5] = readInt(s[16..20], u32, Endian.Little);
+        t[6] = readInt(s[20..23], u32, Endian.Little) << 7;
+        t[7] = readInt(s[23..26], u32, Endian.Little) << 5;
+        t[8] = readInt(s[26..29], u32, Endian.Little) << 4;
+        t[9] = (readInt(s[29..32], u32, Endian.Little) & 0x7fffff) << 2;
+
+        carry1(h, t[0..]);
+    }
+
+    fn mulSmall(h: *Fe, f: *const Fe, comptime g: comptime_int) void {
+        var t: [10]i64 = undefined;
+
+        for (t[0..]) |_, i| {
+            t[i] = i64(f.b[i]) * g;
+        }
+
+        carry1(h, t[0..]);
+    }
+
+    fn mul(h: *Fe, f1: *const Fe, g1: *const Fe) void {
+        const f = f1.b;
+        const g = g1.b;
+
+        var F: [10]i32 = undefined;
+        var G: [10]i32 = undefined;
+
+        F[1] = f[1] * 2;
+        F[3] = f[3] * 2;
+        F[5] = f[5] * 2;
+        F[7] = f[7] * 2;
+        F[9] = f[9] * 2;
+
+        G[1] = g[1] * 19;
+        G[2] = g[2] * 19;
+        G[3] = g[3] * 19;
+        G[4] = g[4] * 19;
+        G[5] = g[5] * 19;
+        G[6] = g[6] * 19;
+        G[7] = g[7] * 19;
+        G[8] = g[8] * 19;
+        G[9] = g[9] * 19;
+
+        // t's become h
+        var t: [10]i64 = undefined;
+
+        t[0] = f[0] * i64(g[0]) + F[1] * i64(G[9]) + f[2] * i64(G[8]) + F[3] * i64(G[7]) + f[4] * i64(G[6]) + F[5] * i64(G[5]) + f[6] * i64(G[4]) + F[7] * i64(G[3]) + f[8] * i64(G[2]) + F[9] * i64(G[1]);
+        t[1] = f[0] * i64(g[1]) + f[1] * i64(g[0]) + f[2] * i64(G[9]) + f[3] * i64(G[8]) + f[4] * i64(G[7]) + f[5] * i64(G[6]) + f[6] * i64(G[5]) + f[7] * i64(G[4]) + f[8] * i64(G[3]) + f[9] * i64(G[2]);
+        t[2] = f[0] * i64(g[2]) + F[1] * i64(g[1]) + f[2] * i64(g[0]) + F[3] * i64(G[9]) + f[4] * i64(G[8]) + F[5] * i64(G[7]) + f[6] * i64(G[6]) + F[7] * i64(G[5]) + f[8] * i64(G[4]) + F[9] * i64(G[3]);
+        t[3] = f[0] * i64(g[3]) + f[1] * i64(g[2]) + f[2] * i64(g[1]) + f[3] * i64(g[0]) + f[4] * i64(G[9]) + f[5] * i64(G[8]) + f[6] * i64(G[7]) + f[7] * i64(G[6]) + f[8] * i64(G[5]) + f[9] * i64(G[4]);
+        t[4] = f[0] * i64(g[4]) + F[1] * i64(g[3]) + f[2] * i64(g[2]) + F[3] * i64(g[1]) + f[4] * i64(g[0]) + F[5] * i64(G[9]) + f[6] * i64(G[8]) + F[7] * i64(G[7]) + f[8] * i64(G[6]) + F[9] * i64(G[5]);
+        t[5] = f[0] * i64(g[5]) + f[1] * i64(g[4]) + f[2] * i64(g[3]) + f[3] * i64(g[2]) + f[4] * i64(g[1]) + f[5] * i64(g[0]) + f[6] * i64(G[9]) + f[7] * i64(G[8]) + f[8] * i64(G[7]) + f[9] * i64(G[6]);
+        t[6] = f[0] * i64(g[6]) + F[1] * i64(g[5]) + f[2] * i64(g[4]) + F[3] * i64(g[3]) + f[4] * i64(g[2]) + F[5] * i64(g[1]) + f[6] * i64(g[0]) + F[7] * i64(G[9]) + f[8] * i64(G[8]) + F[9] * i64(G[7]);
+        t[7] = f[0] * i64(g[7]) + f[1] * i64(g[6]) + f[2] * i64(g[5]) + f[3] * i64(g[4]) + f[4] * i64(g[3]) + f[5] * i64(g[2]) + f[6] * i64(g[1]) + f[7] * i64(g[0]) + f[8] * i64(G[9]) + f[9] * i64(G[8]);
+        t[8] = f[0] * i64(g[8]) + F[1] * i64(g[7]) + f[2] * i64(g[6]) + F[3] * i64(g[5]) + f[4] * i64(g[4]) + F[5] * i64(g[3]) + f[6] * i64(g[2]) + F[7] * i64(g[1]) + f[8] * i64(g[0]) + F[9] * i64(G[9]);
+        t[9] = f[0] * i64(g[9]) + f[1] * i64(g[8]) + f[2] * i64(g[7]) + f[3] * i64(g[6]) + f[4] * i64(g[5]) + f[5] * i64(g[4]) + f[6] * i64(g[3]) + f[7] * i64(g[2]) + f[8] * i64(g[1]) + f[9] * i64(g[0]);
+
+        carry2(h, t[0..]);
+    }
+
+    // we could use Fe.mul() for this, but this is significantly faster
+    fn sq(h: *Fe, fz: *const Fe) void {
+        const f0 = fz.b[0];
+        const f1 = fz.b[1];
+        const f2 = fz.b[2];
+        const f3 = fz.b[3];
+        const f4 = fz.b[4];
+        const f5 = fz.b[5];
+        const f6 = fz.b[6];
+        const f7 = fz.b[7];
+        const f8 = fz.b[8];
+        const f9 = fz.b[9];
+
+        const f0_2 = f0 * 2;
+        const f1_2 = f1 * 2;
+        const f2_2 = f2 * 2;
+        const f3_2 = f3 * 2;
+        const f4_2 = f4 * 2;
+        const f5_2 = f5 * 2;
+        const f6_2 = f6 * 2;
+        const f7_2 = f7 * 2;
+        const f5_38 = f5 * 38;
+        const f6_19 = f6 * 19;
+        const f7_38 = f7 * 38;
+        const f8_19 = f8 * 19;
+        const f9_38 = f9 * 38;
+
+        var t: [10]i64 = undefined;
+
+        t[0] = f0 * i64(f0) + f1_2 * i64(f9_38) + f2_2 * i64(f8_19) + f3_2 * i64(f7_38) + f4_2 * i64(f6_19) + f5 * i64(f5_38);
+        t[1] = f0_2 * i64(f1) + f2 * i64(f9_38) + f3_2 * i64(f8_19) + f4 * i64(f7_38) + f5_2 * i64(f6_19);
+        t[2] = f0_2 * i64(f2) + f1_2 * i64(f1) + f3_2 * i64(f9_38) + f4_2 * i64(f8_19) + f5_2 * i64(f7_38) + f6 * i64(f6_19);
+        t[3] = f0_2 * i64(f3) + f1_2 * i64(f2) + f4 * i64(f9_38) + f5_2 * i64(f8_19) + f6 * i64(f7_38);
+        t[4] = f0_2 * i64(f4) + f1_2 * i64(f3_2) + f2 * i64(f2) + f5_2 * i64(f9_38) + f6_2 * i64(f8_19) + f7 * i64(f7_38);
+        t[5] = f0_2 * i64(f5) + f1_2 * i64(f4) + f2_2 * i64(f3) + f6 * i64(f9_38) + f7_2 * i64(f8_19);
+        t[6] = f0_2 * i64(f6) + f1_2 * i64(f5_2) + f2_2 * i64(f4) + f3_2 * i64(f3) + f7_2 * i64(f9_38) + f8 * i64(f8_19);
+        t[7] = f0_2 * i64(f7) + f1_2 * i64(f6) + f2_2 * i64(f5) + f3_2 * i64(f4) + f8 * i64(f9_38);
+        t[8] = f0_2 * i64(f8) + f1_2 * i64(f7_2) + f2_2 * i64(f6) + f3_2 * i64(f5_2) + f4 * i64(f4) + f9 * i64(f9_38);
+        t[9] = f0_2 * i64(f9) + f1_2 * i64(f8) + f2_2 * i64(f7) + f3_2 * i64(f6) + f4 * i64(f5_2);
+
+        carry2(h, t[0..]);
+    }
+
+    fn sq2(h: *Fe, f: *const Fe) void {
+        Fe.sq(h, f); // h = f^2
+        Fe.mulSmall(h, h, 2); // h = 2 * f^2; the small-constant multiply is declared as `mulSmall`
+    }
+
+    // This could be simplified, but it would be slower
+    fn invert(out: *Fe, z: *const Fe) void {
+        var i: usize = undefined;
+
+        var t: [4]Fe = undefined;
+        var t0 = &t[0];
+        var t1 = &t[1];
+        var t2 = &t[2];
+        var t3 = &t[3];
+
+        Fe.sq(t0, z);
+        Fe.sq(t1, t0);
+        Fe.sq(t1, t1);
+        Fe.mul(t1, z, t1);
+        Fe.mul(t0, t0, t1);
+
+        Fe.sq(t2, t0);
+        Fe.mul(t1, t1, t2);
+
+        Fe.sq(t2, t1);
+        i = 1;
+        while (i < 5) : (i += 1) Fe.sq(t2, t2);
+        Fe.mul(t1, t2, t1);
+
+        Fe.sq(t2, t1);
+        i = 1;
+        while (i < 10) : (i += 1) Fe.sq(t2, t2);
+        Fe.mul(t2, t2, t1);
+
+        Fe.sq(t3, t2);
+        i = 1;
+        while (i < 20) : (i += 1) Fe.sq(t3, t3);
+        Fe.mul(t2, t3, t2);
+
+        Fe.sq(t2, t2);
+        i = 1;
+        while (i < 10) : (i += 1) Fe.sq(t2, t2);
+        Fe.mul(t1, t2, t1);
+
+        Fe.sq(t2, t1);
+        i = 1;
+        while (i < 50) : (i += 1) Fe.sq(t2, t2);
+        Fe.mul(t2, t2, t1);
+
+        Fe.sq(t3, t2);
+        i = 1;
+        while (i < 100) : (i += 1) Fe.sq(t3, t3);
+        Fe.mul(t2, t3, t2);
+
+        Fe.sq(t2, t2);
+        i = 1;
+        while (i < 50) : (i += 1) Fe.sq(t2, t2);
+        Fe.mul(t1, t2, t1);
+
+        Fe.sq(t1, t1);
+        i = 1;
+        while (i < 5) : (i += 1) Fe.sq(t1, t1);
+        Fe.mul(out, t1, t0);
+
+        t0.secureZero();
+        t1.secureZero();
+        t2.secureZero();
+        t3.secureZero();
+    }
+
+    // This could be simplified, but it would be slower
+    fn pow22523(out: *Fe, z: *const Fe) void {
+        var i: usize = undefined;
+
+        var t: [3]Fe = undefined;
+        var t0 = &t[0];
+        var t1 = &t[1];
+        var t2 = &t[2];
+
+        Fe.sq(t0, z);
+        Fe.sq(t1, t0);
+        Fe.sq(t1, t1);
+        Fe.mul(t1, z, t1);
+        Fe.mul(t0, t0, t1);
+
+        Fe.sq(t0, t0);
+        Fe.mul(t0, t1, t0);
+
+        Fe.sq(t1, t0);
+        i = 1;
+        while (i < 5) : (i += 1) Fe.sq(t1, t1);
+        Fe.mul(t0, t1, t0);
+
+        Fe.sq(t1, t0);
+        i = 1;
+        while (i < 10) : (i += 1) Fe.sq(t1, t1);
+        Fe.mul(t1, t1, t0);
+
+        Fe.sq(t2, t1);
+        i = 1;
+        while (i < 20) : (i += 1) Fe.sq(t2, t2);
+        Fe.mul(t1, t2, t1);
+
+        Fe.sq(t1, t1);
+        i = 1;
+        while (i < 10) : (i += 1) Fe.sq(t1, t1);
+        Fe.mul(t0, t1, t0);
+
+        Fe.sq(t1, t0);
+        i = 1;
+        while (i < 50) : (i += 1) Fe.sq(t1, t1);
+        Fe.mul(t1, t1, t0);
+
+        Fe.sq(t2, t1);
+        i = 1;
+        while (i < 100) : (i += 1) Fe.sq(t2, t2);
+        Fe.mul(t1, t2, t1);
+
+        Fe.sq(t1, t1);
+        i = 1;
+        while (i < 50) : (i += 1) Fe.sq(t1, t1);
+        Fe.mul(t0, t1, t0);
+
+        Fe.sq(t0, t0);
+        i = 1;
+        while (i < 2) : (i += 1) Fe.sq(t0, t0);
+        Fe.mul(out, t0, z);
+
+        t0.secureZero();
+        t1.secureZero();
+        t2.secureZero();
+    }
+
+    inline fn toBytesRound(c: []i64, t: []i64, comptime i: comptime_int, comptime shift: comptime_int) void {
+        c[i] = t[i] >> shift;
+        if (i + 1 < 10) {
+            t[i + 1] += c[i];
+        }
+        t[i] -= c[i] * (i32(1) << shift);
+    }
+
+    fn toBytes(s: []u8, h: *const Fe) void {
+        std.debug.assert(s.len >= 32);
+
+        var t: [10]i64 = undefined;
+        for (h.b[0..]) |_, i| {
+            t[i] = h.b[i];
+        }
+
+        var q = (19 * t[9] + ((i32(1) << 24))) >> 25;
+        {
+            var i: usize = 0;
+            while (i < 5) : (i += 1) {
+                q += t[2 * i];
+                q >>= 26;
+                q += t[2 * i + 1];
+                q >>= 25;
+            }
+        }
+        t[0] += 19 * q;
+
+        var c: [10]i64 = undefined;
+
+        var st = t[0..];
+        var sc = c[0..];
+
+        toBytesRound(sc, st, 0, 26);
+        toBytesRound(sc, st, 1, 25);
+        toBytesRound(sc, st, 2, 26);
+        toBytesRound(sc, st, 3, 25);
+        toBytesRound(sc, st, 4, 26);
+        toBytesRound(sc, st, 5, 25);
+        toBytesRound(sc, st, 6, 26);
+        toBytesRound(sc, st, 7, 25);
+        toBytesRound(sc, st, 8, 26);
+        toBytesRound(sc, st, 9, 25);
+
+        var ut: [10]u32 = undefined;
+        for (ut[0..]) |_, i| {
+            ut[i] = @bitCast(u32, @intCast(i32, t[i]));
+        }
+
+        writeInt(s[0..], (ut[0] >> 0) | (ut[1] << 26), Endian.Little);
+        writeInt(s[4..], (ut[1] >> 6) | (ut[2] << 19), Endian.Little);
+        writeInt(s[8..], (ut[2] >> 13) | (ut[3] << 13), Endian.Little);
+        writeInt(s[12..], (ut[3] >> 19) | (ut[4] << 6), Endian.Little);
+        writeInt(s[16..], (ut[5] >> 0) | (ut[6] << 25), Endian.Little);
+        writeInt(s[20..], (ut[6] >> 7) | (ut[7] << 19), Endian.Little);
+        writeInt(s[24..], (ut[7] >> 13) | (ut[8] << 12), Endian.Little);
+        writeInt(s[28..], (ut[8] >> 20) | (ut[9] << 6), Endian.Little);
+
+        std.mem.secureZero(i64, t[0..]);
+    }
+
+    // Parity check on the byte encoding of `f`: returns true if odd, false if even.
+    fn isNegative(f: *const Fe) bool {
+        var s: [32]u8 = undefined;
+        defer std.mem.secureZero(u8, s[0..]); // wipe the scratch encoding on exit
+        Fe.toBytes(s[0..], f);
+        // The masked `u8` does not coerce to `bool`, so compare against zero explicitly.
+        return (s[0] & 1) != 0;
+    }
+
+    // Returns true if the byte encoding of `f` has any non-zero byte (`zerocmp` is true for all-zero).
+    fn isNonZero(f: *const Fe) bool {
+        var s: [32]u8 = undefined;
+        defer std.mem.secureZero(u8, s[0..]); // wipe the scratch encoding on exit
+        Fe.toBytes(s[0..], f);
+        return !zerocmp(u8, s[0..]);
+    }
+};
+
+test "x25519 rfc7748 vector1" {
+    const secret_key = "\xa5\x46\xe3\x6b\xf0\x52\x7c\x9d\x3b\x16\x15\x4b\x82\x46\x5e\xdd\x62\x14\x4c\x0a\xc1\xfc\x5a\x18\x50\x6a\x22\x44\xba\x44\x9a\xc4";
+    const public_key = "\xe6\xdb\x68\x67\x58\x30\x30\xdb\x35\x94\xc1\xa4\x24\xb1\x5f\x7c\x72\x66\x24\xec\x26\xb3\x35\x3b\x10\xa9\x03\xa6\xd0\xab\x1c\x4c";
+
+    const expected_output = "\xc3\xda\x55\x37\x9d\xe9\xc6\x90\x8e\x94\xea\x4d\xf2\x8d\x08\x4f\x32\xec\xcf\x03\x49\x1c\x71\xf7\x54\xb4\x07\x55\x77\xa2\x85\x52";
+
+    var output: [32]u8 = undefined;
+
+    std.debug.assert(X25519.create(output[0..], secret_key, public_key));
+    std.debug.assert(std.mem.eql(u8, output, expected_output));
+}
+
+test "x25519 rfc7748 vector2" {
+    const secret_key = "\x4b\x66\xe9\xd4\xd1\xb4\x67\x3c\x5a\xd2\x26\x91\x95\x7d\x6a\xf5\xc1\x1b\x64\x21\xe0\xea\x01\xd4\x2c\xa4\x16\x9e\x79\x18\xba\x0d";
+    const public_key = "\xe5\x21\x0f\x12\x78\x68\x11\xd3\xf4\xb7\x95\x9d\x05\x38\xae\x2c\x31\xdb\xe7\x10\x6f\xc0\x3c\x3e\xfc\x4c\xd5\x49\xc7\x15\xa4\x93";
+
+    const expected_output = "\x95\xcb\xde\x94\x76\xe8\x90\x7d\x7a\xad\xe4\x5c\xb4\xb8\x73\xf8\x8b\x59\x5a\x68\x79\x9f\xa1\x52\xe6\xf8\xf7\x64\x7a\xac\x79\x57";
+
+    var output: [32]u8 = undefined;
+
+    std.debug.assert(X25519.create(output[0..], secret_key, public_key));
+    std.debug.assert(std.mem.eql(u8, output, expected_output));
+}
+
+test "x25519 rfc7748 one iteration" {
+    const initial_value = "\x09\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00";
+    const expected_output = "\x42\x2c\x8e\x7a\x62\x27\xd7\xbc\xa1\x35\x0b\x3e\x2b\xb7\x27\x9f\x78\x97\xb8\x7b\xb6\x85\x4b\x78\x3c\x60\xe8\x03\x11\xae\x30\x79";
+
+    var k: [32]u8 = initial_value;
+    var u: [32]u8 = initial_value;
+
+    var i: usize = 0;
+    while (i < 1) : (i += 1) {
+        var output: [32]u8 = undefined;
+        std.debug.assert(X25519.create(output[0..], k, u));
+
+        std.mem.copy(u8, u[0..], k[0..]);
+        std.mem.copy(u8, k[0..], output[0..]);
+    }
+
+    std.debug.assert(std.mem.eql(u8, k[0..], expected_output));
+}
+
+test "x25519 rfc7748 1,000 iterations" {
+    // These iteration tests are slow so we always skip them. Results have been verified.
+    if (true) {
+        return error.SkipZigTest;
+    }
+
+    const initial_value = "\x09\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00";
+    const expected_output = "\x68\x4c\xf5\x9b\xa8\x33\x09\x55\x28\x00\xef\x56\x6f\x2f\x4d\x3c\x1c\x38\x87\xc4\x93\x60\xe3\x87\x5f\x2e\xb9\x4d\x99\x53\x2c\x51";
+
+    var k: [32]u8 = initial_value;
+    var u: [32]u8 = initial_value;
+
+    var i: usize = 0;
+    while (i < 1000) : (i += 1) {
+        var output: [32]u8 = undefined;
+        std.debug.assert(X25519.create(output[0..], k, u));
+
+        std.mem.copy(u8, u[0..], k[0..]);
+        std.mem.copy(u8, k[0..], output[0..]);
+    }
+
+    std.debug.assert(std.mem.eql(u8, k[0..], expected_output));
+}
+
+test "x25519 rfc7748 1,000,000 iterations" {
+    if (true) {
+        return error.SkipZigTest;
+    }
+
+    const initial_value = "\x09\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00";
+    const expected_output = "\x7c\x39\x11\xe0\xab\x25\x86\xfd\x86\x44\x97\x29\x7e\x57\x5e\x6f\x3b\xc6\x01\xc0\x88\x3c\x30\xdf\x5f\x4d\xd2\xd2\x4f\x66\x54\x24";
+
+    var k: [32]u8 = initial_value;
+    var u: [32]u8 = initial_value;
+
+    var i: usize = 0;
+    while (i < 1000000) : (i += 1) {
+        var output: [32]u8 = undefined;
+        std.debug.assert(X25519.create(output[0..], k, u));
+
+        std.mem.copy(u8, u[0..], k[0..]);
+        std.mem.copy(u8, k[0..], output[0..]);
+    }
+
+    std.debug.assert(std.mem.eql(u8, k[0..], expected_output));
+}