lib/std/simd.zig


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452

//! This module provides functions for working conveniently with SIMD (Single Instruction; Multiple Data),
//! which may offer a potential boost in performance on some targets by performing the same operations on
//! multiple elements at once.
//! Please be aware that some functions are known to not work on MIPS.

const std = @import("std");
const builtin = @import("builtin");

/// Suggests a vector length (a number of elements of type `T`) for the given `cpu`.
///
/// Returns null when none of the CPU features checked below is present, or when a single element
/// is already at least as wide as the chosen vector width.
///
/// The element width used for the calculation is `@bitSizeOf(T)` rounded up to a power of two,
/// with a minimum of 8 bits. The per-architecture widths are heuristics, and not every CPU
/// architecture is covered.
pub fn suggestVectorSizeForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?usize {
    // This is guesswork; if you have better suggestions, add them or edit the current ones here.
    // This can run in comptime only, but stage 1 fails at it, stage 2 can understand it
    const element_bit_size = @max(8, std.math.ceilPowerOfTwo(u16, @bitSizeOf(T)) catch unreachable);
    const vector_bit_size: u16 = blk: {
        if (cpu.arch.isX86()) {
            // Booleans get a fixed suggestion of 64 elements when mask registers are preferred.
            if (T == bool and std.Target.x86.featureSetHas(cpu.features, .prefer_mask_registers)) return 64;
            if (std.Target.x86.featureSetHas(cpu.features, .avx512f) and !std.Target.x86.featureSetHasAny(cpu.features, .{ .prefer_256_bit, .prefer_128_bit })) break :blk 512;
            if (std.Target.x86.featureSetHasAny(cpu.features, .{ .prefer_256_bit, .avx2 }) and !std.Target.x86.featureSetHas(cpu.features, .prefer_128_bit)) break :blk 256;
            if (std.Target.x86.featureSetHas(cpu.features, .sse)) break :blk 128;
            if (std.Target.x86.featureSetHasAny(cpu.features, .{ .mmx, .@"3dnow" })) break :blk 64;
        } else if (cpu.arch.isARM()) {
            if (std.Target.arm.featureSetHas(cpu.features, .neon)) break :blk 128;
        } else if (cpu.arch.isAARCH64()) {
            // SVE allows up to 2048 bits in the specification; as of 2022 the most powerful machine has implemented 512-bit.
            // It seems safer to stay on 128 until wider implementations are more common.
            // TODO: Check on this return when bigger values are more common
            if (std.Target.aarch64.featureSetHas(cpu.features, .sve)) break :blk 128;
            if (std.Target.aarch64.featureSetHas(cpu.features, .neon)) break :blk 128;
        } else if (cpu.arch.isPPC() or cpu.arch.isPPC64()) {
            if (std.Target.powerpc.featureSetHas(cpu.features, .altivec)) break :blk 128;
        } else if (cpu.arch.isMIPS()) {
            if (std.Target.mips.featureSetHas(cpu.features, .msa)) break :blk 128;
            // TODO: Test MIPS capability to handle bigger vectors
            //       In theory MDMX and by extension mips3d have 32 registers of 64 bits which can be used in parallel
            //       for multiple processing, but it is unclear what's optimal here: using
            //       the 2048 bits, using just 64 per vector, or something in between
            if (std.Target.mips.featureSetHas(cpu.features, .mips3d)) break :blk 64;
        } else if (cpu.arch.isRISCV()) {
            // In RISC-V the Vector Extension allows configurable vector sizes, but a standard size of 128 is a safe estimate
            if (std.Target.riscv.featureSetHas(cpu.features, .v)) break :blk 128;
        } else if (cpu.arch.isSPARC()) {
            // TODO: Test Sparc capability to handle bigger vectors
            //       In theory Sparc has 32 registers of 64 bits which can be used in parallel
            //       for multiple processing, but it is unclear what's optimal here: using
            //       the 2048 bits, using just 64 per vector, or something in between
            if (std.Target.sparc.featureSetHasAny(cpu.features, .{ .vis, .vis2, .vis3 })) break :blk 64;
        }
        return null;
    };
    // A "vector" of one element (or less) is pointless; recommend scalars instead.
    if (vector_bit_size <= element_bit_size) return null;

    return @divExact(vector_bit_size, element_bit_size);
}

/// Recommends how many elements of type `T` to put in a vector for the CPU described by `builtin.cpu`.
/// Yields null when no vector size is suggested, in which case scalars are recommended.
/// Not every CPU architecture is covered yet.
pub fn suggestVectorSize(comptime T: type) ?usize {
    const target_cpu = builtin.cpu;
    return suggestVectorSizeForCpu(T, target_cpu);
}

test "suggestVectorSizeForCpu works with signed and unsigned values" {
    // Start from the x86_64 baseline and enable AVX-512F so that a 512-bit width is selected.
    comptime var cpu = std.Target.Cpu.baseline(std.Target.Cpu.Arch.x86_64);
    comptime cpu.features.addFeature(@enumToInt(std.Target.x86.Feature.avx512f));

    // Signedness must not matter: 512 bits / 32 bits per element = 16 elements in both cases.
    inline for (.{ u32, i32 }) |Element| {
        const suggested = suggestVectorSizeForCpu(Element, cpu).?;
        try std.testing.expectEqual(@as(usize, 16), suggested);
    }
}

/// Returns the number of elements of a vector or array type.
/// Any other kind of type is rejected with a compile error.
fn vectorLength(comptime VectorType: type) comptime_int {
    switch (@typeInfo(VectorType)) {
        .Vector => |vector| return vector.len,
        .Array => |array| return array.len,
        else => @compileError("Invalid type " ++ @typeName(VectorType)),
    }
}

/// Returns the smallest unsigned integer type that can represent every valid element index
/// (0 through length - 1) of the given vector type.
pub fn VectorIndex(comptime VectorType: type) type {
    const last_index = vectorLength(VectorType) - 1;
    return std.math.IntFittingRange(0, last_index);
}

/// Returns the smallest unsigned integer type that can represent every element count
/// from 0 up to and including the length of the given vector type.
pub fn VectorCount(comptime VectorType: type) type {
    const element_count = vectorLength(VectorType);
    return std.math.IntFittingRange(0, element_count);
}

/// Builds, at compile time, a vector of `len` elements of type `T` counting upwards from 0 to `len`-1.
/// For example, `iota(i32, 8)` will return a vector containing `.{0, 1, 2, 3, 4, 5, 6, 7}`.
/// `T` must be an integer or a float type; anything else is a compile error.
pub inline fn iota(comptime T: type, comptime len: usize) @Vector(len, T) {
    comptime {
        var values: [len]T = undefined;
        for (0..len) |index| {
            values[index] = switch (@typeInfo(T)) {
                .Int => @intCast(T, index),
                .Float => @intToFloat(T, index),
                else => @compileError("Can't use type " ++ @typeName(T) ++ " in iota."),
            };
        }
        return @as(@Vector(len, T), values);
    }
}

/// Produces a vector of `len` elements by cycling through the elements of `vec` from the start
/// as many times as needed.
/// For example, `repeat(8, [_]u32{1, 2, 3})` will return a vector containing `.{1, 2, 3, 1, 2, 3, 1, 2}`.
pub fn repeat(comptime len: usize, vec: anytype) @Vector(len, std.meta.Child(@TypeOf(vec))) {
    const Source = @TypeOf(vec);
    const source_len = vectorLength(Source);

    // Output position p reads source element p modulo the source length.
    const mask = comptime iota(i32, len) % @splat(len, @intCast(i32, source_len));
    return @shuffle(std.meta.Child(Source), vec, undefined, mask);
}

/// Concatenates two vectors: the result holds every element of `a` at the lower indices,
/// followed by every element of `b` at the higher indices.
pub fn join(a: anytype, b: anytype) @Vector(vectorLength(@TypeOf(a)) + vectorLength(@TypeOf(b)), std.meta.Child(@TypeOf(a))) {
    const a_len = vectorLength(@TypeOf(a));
    const b_len = vectorLength(@TypeOf(b));
    const Child = std.meta.Child(@TypeOf(a));

    const mask = comptime blk: {
        // Non-negative mask entries pick from `a`.
        const from_a = @as([a_len]i32, iota(i32, a_len));
        // Bitwise-negated entries pick from `b`.
        const from_b = @as([b_len]i32, ~iota(i32, b_len));
        break :blk from_a ++ from_b;
    };
    return @shuffle(Child, a, b, mask);
}

/// Returns a vector whose elements alternate between those of each input vector.
/// For example, `interlace(.{[4]u32{11, 12, 13, 14}, [4]u32{21, 22, 23, 24}})` returns a vector containing `.{11, 21, 12, 22, 13, 23, 14, 24}`.
/// `vecs` is coerced to an array whose element type is the type of `vecs[0]`.
/// Compiling this function for a MIPS target is a compile error.
pub fn interlace(vecs: anytype) @Vector(vectorLength(@TypeOf(vecs[0])) * vecs.len, std.meta.Child(@TypeOf(vecs[0]))) {
    // interlace doesn't work on MIPS, for some reason.
    // Notes from earlier debug attempt:
    //  The indices are correct. The problem seems to be with the @shuffle builtin.
    //  On MIPS, the test that interlaces small_base gives { 0, 2, 0, 0, 64, 255, 248, 200, 0, 0 }.
    //  Calling this with two inputs seems to work fine, but I'll let the compile error trigger for all inputs, just to be safe.
    comptime if (builtin.cpu.arch.isMIPS()) @compileError("TODO: Find out why interlace() doesn't work on MIPS");

    const VecType = @TypeOf(vecs[0]);
    const vecs_arr = @as([vecs.len]VecType, vecs);
    const Child = std.meta.Child(@TypeOf(vecs_arr[0]));

    // Base case of the recursion: a single vector is already "interlaced".
    if (vecs_arr.len == 1) return vecs_arr[0];

    // Split the inputs into a first half (rounded up) and a second half (rounded down).
    const a_vec_count = (1 + vecs_arr.len) >> 1;
    const b_vec_count = vecs_arr.len >> 1;

    // Interlace each half on its own; the two results are then merged with a single shuffle below.
    const a = interlace(@ptrCast(*const [a_vec_count]VecType, vecs_arr[0..a_vec_count]).*);
    const b = interlace(@ptrCast(*const [b_vec_count]VecType, vecs_arr[a_vec_count..]).*);

    const a_len = vectorLength(@TypeOf(a));
    const b_len = vectorLength(@TypeOf(b));
    const len = a_len + b_len;

    const indices = comptime blk: {
        // Output positions 0..len-1.
        const count_up = iota(i32, len);
        // Which group of `vecs_arr.len` consecutive output elements each position belongs to.
        const cycle = @divFloor(count_up, @splat(len, @intCast(i32, vecs_arr.len)));
        // Within every group, the first a_vec_count elements come from `a` and the remaining b_vec_count from `b`.
        const select_mask = repeat(len, join(@splat(a_vec_count, true), @splat(b_vec_count, false)));
        // Each completed group consumed b_vec_count elements of `b`, so subtract those to get the index into `a`.
        const a_indices = count_up - cycle * @splat(len, @intCast(i32, b_vec_count));
        // Same idea for `b`, moved a_vec_count positions to the right so the values line up with the
        // positions inside each group that are taken from `b`.
        const b_indices = shiftElementsRight(count_up - cycle * @splat(len, @intCast(i32, a_vec_count)), a_vec_count, 0);
        // Indices into `b` are bitwise-negated, as in `join`, so that @shuffle reads them from its second operand.
        break :blk @select(i32, select_mask, a_indices, ~b_indices);
    };

    return @shuffle(Child, a, b, indices);
}

/// Splits `interlaced` evenly into `vec_count` vectors, returned as an array. The vectors "take turns"
/// receiving one element of `interlaced` at a time: output vector `k` holds the elements at
/// indices `k`, `k + vec_count`, `k + 2 * vec_count`, and so on.
pub fn deinterlace(
    comptime vec_count: usize,
    interlaced: anytype,
) [vec_count]@Vector(
    vectorLength(@TypeOf(interlaced)) / vec_count,
    std.meta.Child(@TypeOf(interlaced)),
) {
    const Source = @TypeOf(interlaced);
    const Element = std.meta.Child(Source);
    const lane_count = vectorLength(Source) / vec_count;

    var result: [vec_count]@Vector(lane_count, Element) = undefined;

    // A comptime counter with an inline while is used because for-loops reportedly don't work for this.
    comptime var which: usize = 0;
    inline while (which < vec_count) : (which += 1) {
        const stride = comptime @splat(lane_count, @intCast(i32, vec_count));
        const offset = comptime @splat(lane_count, @intCast(i32, which));
        const mask = comptime iota(i32, lane_count) * stride + offset;
        result[which] = @shuffle(Element, interlaced, undefined, mask);
    }

    return result;
}

/// Returns a vector holding `count` consecutive elements of `vec`, beginning at index `first`.
/// `first + count` must not exceed the length of `vec`.
pub fn extract(
    vec: anytype,
    comptime first: VectorIndex(@TypeOf(vec)),
    comptime count: VectorCount(@TypeOf(vec)),
) @Vector(count, std.meta.Child(@TypeOf(vec))) {
    const Source = @TypeOf(vec);
    const end = @intCast(comptime_int, first) + @intCast(comptime_int, count);

    std.debug.assert(end <= vectorLength(Source));

    // Output position p reads source element first + p.
    const mask = comptime iota(i32, count) + @splat(count, @intCast(i32, first));
    return @shuffle(std.meta.Child(Source), vec, undefined, mask);
}

test "vector patterns" {
    if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .aarch64) {
        // https://github.com/ziglang/zig/issues/12012
        return error.SkipZigTest;
    }
    const expectEqual = std.testing.expectEqual;

    const first = @Vector(4, u32){ 10, 20, 30, 40 };
    const second = @Vector(4, u32){ 55, 66, 77, 88 };

    const pairs = [5]@Vector(2, u8){
        @Vector(2, u8){ 0, 1 },
        @Vector(2, u8){ 2, 3 },
        @Vector(2, u8){ 4, 5 },
        @Vector(2, u8){ 6, 7 },
        @Vector(2, u8){ 8, 9 },
    };

    try expectEqual([6]u32{ 10, 20, 30, 40, 10, 20 }, repeat(6, first));
    try expectEqual([8]u32{ 10, 20, 30, 40, 55, 66, 77, 88 }, join(first, second));
    try expectEqual([2]u32{ 20, 30 }, extract(first, 1, 2));

    // interlace is a compile error on MIPS, so only reference it on other targets.
    if (comptime !builtin.cpu.arch.isMIPS()) {
        try expectEqual([8]u32{ 10, 55, 20, 66, 30, 77, 40, 88 }, interlace(.{ first, second }));

        // Interlacing and then deinterlacing must give back the original vectors.
        const braided = interlace(pairs);
        try expectEqual([10]u8{ 0, 2, 4, 6, 8, 1, 3, 5, 7, 9 }, braided);
        try expectEqual(pairs, deinterlace(pairs.len, braided));
    }
}

/// Concatenates `a` and `b`, then returns the window of that concatenation which starts at index
/// `shift` and is as long as `a` and `b` themselves. In other words, the joined vectors are shifted
/// leftwards (towards lower indices) by `shift` and the leftmost elements are kept.
pub fn mergeShift(a: anytype, b: anytype, comptime shift: VectorCount(@TypeOf(a, b))) @TypeOf(a, b) {
    const joined = join(a, b);
    const window_len = vectorLength(@TypeOf(a, b));

    return extract(joined, shift, window_len);
}

/// Moves every element `amount` positions towards higher indices. The vacated positions on the left
/// are filled with `shift_in`, and the rightmost elements are dropped so the length stays the same.
pub fn shiftElementsRight(vec: anytype, comptime amount: VectorCount(@TypeOf(vec)), shift_in: std.meta.Child(@TypeOf(vec))) @TypeOf(vec) {
    // It may be possible to implement shifts and rotates with a runtime-friendly slice of two joined vectors, as the length of the
    // slice would be comptime-known. This would permit vector shifts and rotates by a non-comptime-known amount.
    // However, I am unsure whether compiler optimizations would handle that well enough on all platforms.
    const len = vectorLength(@TypeOf(vec));
    const filler = @splat(len, shift_in);

    // Put the filler in front of `vec` and take the window that ends `amount` elements before the end of `vec`.
    return mergeShift(filler, vec, len - amount);
}

/// Moves every element `amount` positions towards lower indices. The vacated positions on the right
/// are filled with `shift_in`, and the elements that would end up below index 0 are dropped.
pub fn shiftElementsLeft(vec: anytype, comptime amount: VectorCount(@TypeOf(vec)), shift_in: std.meta.Child(@TypeOf(vec))) @TypeOf(vec) {
    const filler = @splat(vectorLength(@TypeOf(vec)), shift_in);

    // Put the filler behind `vec` and take the window that starts `amount` elements in.
    return mergeShift(vec, filler, amount);
}

/// Rotates the elements `amount` positions towards lower indices: elements pushed out on the left
/// come back in on the right, keeping their order.
pub fn rotateElementsLeft(vec: anytype, comptime amount: VectorCount(@TypeOf(vec))) @TypeOf(vec) {
    // Joining the vector with itself makes the elements that leave on the left reappear on the right.
    const rotated = mergeShift(vec, vec, amount);
    return rotated;
}

/// Rotates the elements `amount` positions towards higher indices: elements pushed out on the right
/// come back in on the left, keeping their order.
pub fn rotateElementsRight(vec: anytype, comptime amount: VectorCount(@TypeOf(vec))) @TypeOf(vec) {
    // Rotating right by `amount` is expressed as rotating left by the remaining distance.
    const len = vectorLength(@TypeOf(vec));
    return rotateElementsLeft(vec, len - amount);
}

/// Returns a vector with the elements of `vec` in the opposite order:
/// the last element comes first and the first element comes last.
pub fn reverseOrder(vec: anytype) @TypeOf(vec) {
    const Source = @TypeOf(vec);
    const len = vectorLength(Source);

    // Output position p reads source element (len - 1) - p.
    const mask = comptime @splat(len, @intCast(i32, len) - 1) - iota(i32, len);
    return @shuffle(std.meta.Child(Source), vec, undefined, mask);
}

test "vector shifting" {
    const expectEqual = std.testing.expectEqual;
    const input = @Vector(4, u32){ 10, 20, 30, 40 };

    // Shifts fill the vacated positions with the supplied value.
    try expectEqual([4]u32{ 30, 40, 999, 999 }, shiftElementsLeft(input, 2, 999));
    try expectEqual([4]u32{ 999, 999, 10, 20 }, shiftElementsRight(input, 2, 999));

    // Rotations wrap the displaced elements around to the other end.
    try expectEqual([4]u32{ 20, 30, 40, 10 }, rotateElementsLeft(input, 1));
    try expectEqual([4]u32{ 40, 10, 20, 30 }, rotateElementsRight(input, 1));

    try expectEqual([4]u32{ 40, 30, 20, 10 }, reverseOrder(input));
}

/// Returns the lowest index at which `vec` (a vector of booleans) is true, or null if no element is true.
pub fn firstTrue(vec: anytype) ?VectorIndex(@TypeOf(vec)) {
    const Index = VectorIndex(@TypeOf(vec));
    const len = vectorLength(@TypeOf(vec));

    const any_set = @reduce(.Or, vec);
    if (any_set) {
        // False lanes get the largest index value so that they never win the minimum.
        const losing = @splat(len, ~@as(Index, 0));
        const candidates = @select(Index, vec, iota(Index, len), losing);
        return @reduce(.Min, candidates);
    }
    return null;
}

/// Returns the highest index at which `vec` (a vector of booleans) is true, or null if no element is true.
pub fn lastTrue(vec: anytype) ?VectorIndex(@TypeOf(vec)) {
    const Index = VectorIndex(@TypeOf(vec));
    const len = vectorLength(@TypeOf(vec));

    const any_set = @reduce(.Or, vec);
    if (any_set) {
        // False lanes get index 0 so that they never exceed the index of a true lane.
        const losing = @splat(len, @as(Index, 0));
        const candidates = @select(Index, vec, iota(Index, len), losing);
        return @reduce(.Max, candidates);
    }
    return null;
}

/// Returns how many elements of `vec` (a vector of booleans) are true.
pub fn countTrues(vec: anytype) VectorCount(@TypeOf(vec)) {
    const Count = VectorCount(@TypeOf(vec));
    const len = vectorLength(@TypeOf(vec));

    // Turn each lane into 1 (true) or 0 (false), then sum the lanes.
    const ones = @splat(len, @as(Count, 1));
    const zeros = @splat(len, @as(Count, 0));
    return @reduce(.Add, @select(Count, vec, ones, zeros));
}

/// Returns the lowest index whose element equals `value`, or null if `value` does not occur in `vec`.
pub fn firstIndexOfValue(vec: anytype, value: std.meta.Child(@TypeOf(vec))) ?VectorIndex(@TypeOf(vec)) {
    const lane_count = vectorLength(@TypeOf(vec));
    const matches = vec == @splat(lane_count, value);

    return firstTrue(matches);
}

/// Returns the highest index whose element equals `value`, or null if `value` does not occur in `vec`.
pub fn lastIndexOfValue(vec: anytype, value: std.meta.Child(@TypeOf(vec))) ?VectorIndex(@TypeOf(vec)) {
    const lane_count = vectorLength(@TypeOf(vec));
    const matches = vec == @splat(lane_count, value);

    return lastTrue(matches);
}

/// Returns how many elements of `vec` are equal to `value`.
pub fn countElementsWithValue(vec: anytype, value: std.meta.Child(@TypeOf(vec))) VectorCount(@TypeOf(vec)) {
    const lane_count = vectorLength(@TypeOf(vec));
    const matches = vec == @splat(lane_count, value);

    return countTrues(matches);
}

test "vector searching" {
    const expectEqual = std.testing.expectEqual;
    const haystack = @Vector(8, u32){ 6, 4, 7, 4, 4, 2, 3, 7 };

    // The value 4 occurs at indices 1, 3 and 4.
    try expectEqual(@as(?u3, 1), firstIndexOfValue(haystack, 4));
    try expectEqual(@as(?u3, 4), lastIndexOfValue(haystack, 4));
    // A value that is absent yields null.
    try expectEqual(@as(?u3, null), lastIndexOfValue(haystack, 99));
    try expectEqual(@as(u4, 3), countElementsWithValue(haystack, 4));
}

/// Same as prefixScan, but the combining step is a user-provided function, which must be mathematically associative.
/// `hop` must not be 0. A positive `hop` combines each element with elements at lower indices, a negative
/// `hop` with elements at higher indices. Compiling this function for a MIPS target is a compile error.
pub fn prefixScanWithFunc(
    comptime hop: isize,
    vec: anytype,
    /// The error type that `func` might return. Set this to `void` if `func` doesn't return an error union.
    comptime ErrorType: type,
    comptime func: fn (@TypeOf(vec), @TypeOf(vec)) if (ErrorType == void) @TypeOf(vec) else ErrorType!@TypeOf(vec),
    /// When one operand of the operation performed by `func` is this value, the result must equal the other operand.
    /// For example, this should be 0 for addition or 1 for multiplication.
    comptime identity: std.meta.Child(@TypeOf(vec)),
) if (ErrorType == void) @TypeOf(vec) else ErrorType!@TypeOf(vec) {
    // I haven't debugged this, but it might be a cousin of sorts to what's going on with interlace.
    comptime if (builtin.cpu.arch.isMIPS()) @compileError("TODO: Find out why prefixScan doesn't work on MIPS");

    if (hop == 0) @compileError("hop can not be 0; you'd be going nowhere forever!");

    const len = vectorLength(@TypeOf(vec));
    const stride = if (hop < 0) -hop else hop;

    var result = vec;
    // The shift distance doubles every round, and the loop stops once it reaches the vector length.
    comptime var round = 0;
    inline while ((stride << round) < len) : (round += 1) {
        // Positions that have no neighbour at this distance receive `identity`.
        const neighbours = if (hop < 0)
            shiftElementsLeft(result, stride << round, identity)
        else
            shiftElementsRight(result, stride << round, identity);

        result = if (ErrorType == void) func(result, neighbours) else try func(result, neighbours);
    }
    return result;
}

/// Returns a vector whose elements are the result of performing the specified operation on the corresponding
/// element of the input vector and every hop'th element that came before it (or after, if hop is negative).
/// Supports the same operations as the @reduce() builtin. Takes O(logN) to compute.
/// The scan is not linear, which may affect floating point errors. This may affect the determinism of
/// algorithms that use this function.
///
/// Element types may be booleans (`.And`, `.Or`, `.Xor` only), integers (every operation) or
/// floats (`.Min`, `.Max`, `.Add`, `.Mul` only); any other combination is a compile error.
pub fn prefixScan(comptime op: std.builtin.ReduceOp, comptime hop: isize, vec: anytype) @TypeOf(vec) {
    const VecType = @TypeOf(vec);
    const Child = std.meta.Child(VecType);
    const len = vectorLength(VecType);

    // The identity is the value that leaves the other operand of `op` unchanged. It is shifted in
    // for the elements that have no neighbour at a given distance.
    const identity = comptime switch (@typeInfo(Child)) {
        .Bool => switch (op) {
            .Or, .Xor => false,
            .And => true,
            else => @compileError("Invalid prefixScan operation " ++ @tagName(op) ++ " for vector of booleans."),
        },
        .Int => switch (op) {
            .Max => std.math.minInt(Child),
            .Add, .Or, .Xor => 0,
            .Mul => 1,
            // All bits set. For signed integers this is -1, not maxInt(Child): maxInt has a clear
            // sign bit and would turn negative elements positive when ANDed with them.
            .And => ~@as(Child, 0),
            .Min => std.math.maxInt(Child),
        },
        .Float => switch (op) {
            .Max => -std.math.inf(Child),
            .Add => 0,
            .Mul => 1,
            .Min => std.math.inf(Child),
            else => @compileError("Invalid prefixScan operation " ++ @tagName(op) ++ " for vector of floats."),
        },
        else => @compileError("Invalid type " ++ @typeName(VecType) ++ " for prefixScan."),
    };

    // Wraps `op` as an element-wise function of two vectors so it can be handed to prefixScanWithFunc.
    const fn_container = struct {
        fn opFn(a: VecType, b: VecType) VecType {
            return if (Child == bool) switch (op) {
                // Boolean vectors are combined with @select / comparison instead of bitwise operators.
                .And => @select(bool, a, b, @splat(len, false)),
                .Or => @select(bool, a, @splat(len, true), b),
                .Xor => a != b,
                else => unreachable,
            } else switch (op) {
                .And => a & b,
                .Or => a | b,
                .Xor => a ^ b,
                .Add => a + b,
                .Mul => a * b,
                .Min => @min(a, b),
                .Max => @max(a, b),
            };
        }
    };

    return prefixScanWithFunc(hop, vec, void, fn_container.opFn, identity);
}

test "vector prefix scan" {
    // prefixScan is a compile error on MIPS.
    if (comptime builtin.cpu.arch.isMIPS()) {
        return error.SkipZigTest;
    }

    if (builtin.zig_backend == .stage2_llvm) {
        // Regressed in LLVM 14:
        // https://github.com/llvm/llvm-project/issues/55522
        return error.SkipZigTest;
    }

    const expectEqual = std.testing.expectEqual;

    const ints = @Vector(4, i32){ 11, 23, 9, -21 };
    const floats = @Vector(4, f32){ 2, 0.5, -10, 6.54321 };
    const bools = @Vector(4, bool){ true, false, true, false };

    // A running sum over 32 ones counts 1, 2, ..., 32.
    try expectEqual(iota(u8, 32) + @splat(32, @as(u8, 1)), prefixScan(.Add, 1, @splat(32, @as(u8, 1))));

    // Every integer operation with a hop of 1.
    try expectEqual(@Vector(4, i32){ 11, 3, 1, 1 }, prefixScan(.And, 1, ints));
    try expectEqual(@Vector(4, i32){ 11, 31, 31, -1 }, prefixScan(.Or, 1, ints));
    try expectEqual(@Vector(4, i32){ 11, 28, 21, -2 }, prefixScan(.Xor, 1, ints));
    try expectEqual(@Vector(4, i32){ 11, 34, 43, 22 }, prefixScan(.Add, 1, ints));
    try expectEqual(@Vector(4, i32){ 11, 253, 2277, -47817 }, prefixScan(.Mul, 1, ints));
    try expectEqual(@Vector(4, i32){ 11, 11, 9, -21 }, prefixScan(.Min, 1, ints));
    try expectEqual(@Vector(4, i32){ 11, 23, 23, 23 }, prefixScan(.Max, 1, ints));

    // Trying to predict all inaccuracies when adding and multiplying floats with prefixScans would be a mess, so we don't test those.
    try expectEqual(@Vector(4, f32){ 2, 0.5, -10, -10 }, prefixScan(.Min, 1, floats));
    try expectEqual(@Vector(4, f32){ 2, 2, 2, 6.54321 }, prefixScan(.Max, 1, floats));

    // Boolean operations.
    try expectEqual(@Vector(4, bool){ true, true, false, false }, prefixScan(.Xor, 1, bools));
    try expectEqual(@Vector(4, bool){ true, true, true, true }, prefixScan(.Or, 1, bools));
    try expectEqual(@Vector(4, bool){ true, false, false, false }, prefixScan(.And, 1, bools));

    // Hops other than 1, including a negative hop that scans from the high end.
    try expectEqual(@Vector(4, i32){ 11, 23, 20, 2 }, prefixScan(.Add, 2, ints));
    try expectEqual(@Vector(4, i32){ 22, 11, -12, -21 }, prefixScan(.Add, -1, ints));
    try expectEqual(@Vector(4, i32){ 11, 23, 9, -10 }, prefixScan(.Add, 3, ints));
}