diff options
| author | Dan Ellis Echavarria <19101das@gmail.com> | 2022-07-17 02:20:35 -0500 |
|---|---|---|
| committer | Veikka Tuominen <git@vexu.eu> | 2022-07-22 18:58:43 +0300 |
| commit | d1d892c83ca7beaf235147341b7e68d3619dd829 (patch) | |
| tree | 8b920ee96faff0954d7c29da8144bc9fa906a36e /lib/std/simd.zig | |
| parent | 8e75ba653b03477229cf72211e8a8bfe7b071254 (diff) | |
| download | zig-d1d892c83ca7beaf235147341b7e68d3619dd829.tar.gz zig-d1d892c83ca7beaf235147341b7e68d3619dd829.zip | |
SIMD size suggestions: suggestions code now compiles, added more
architectures
The idea behind this is to use the register capabilities in safe amounts;
there is still some consideration to be done.
+ Fixed compile error using std.Target.<arch>.featureSetHas
+ X86 MMX and "3DNOW" 64 bits register usage considered for vector size
+ Added ARM Neon recommended usage of 128 bits (the size of the register)
+ Added AARCH64 Neon and SVE for 128 bits. SVE could in theory use up to
2048 bits, but I found evidence of only a functional 512-bit implementation on a
supercomputer, so I decided on using 128 bits to be safe
+ Added Altivec recommendation of using the 128 bits long register
+ Using MIPS MSA 2x64-bit capabilities, and 64-bit registers for MDMX
systems; needs testing on how using bigger values affects performance
+ Using V extension on RISC-V, which is extendable via instructions, decided on 128 bits
as a value to not use all registers
+ In SPARC the 64-bit registers are used, and a maximum of 32 registers
can be used for configurable SIMD instructions; decided on using
the size of the register. Needs testing of the performance hit of using a
vector size larger than the register
Diffstat (limited to 'lib/std/simd.zig')
| -rw-r--r-- | lib/std/simd.zig | 60 |
1 files changed, 41 insertions, 19 deletions
diff --git a/lib/std/simd.zig b/lib/std/simd.zig index 1a71bd7069..b2655758c0 100644 --- a/lib/std/simd.zig +++ b/lib/std/simd.zig @@ -6,27 +6,49 @@ const std = @import("std"); const builtin = @import("builtin"); -pub fn suggestVectorSizeForCpu(comptime T: type, cpu: std.Target.Cpu) ?usize { - switch (cpu.arch) { - .x86_64 => { - // Note: This is mostly just guesswork. It'd be great if someone more qualified were to take a - // proper look at this. - +pub fn suggestVectorSizeForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?usize { + // This is guesswork, if you have better suggestions can add it or edit the current here + // This can run in comptime only, but stage 1 fails at it, stage 2 can understand it + const element_bit_size = @maximum(8, std.math.ceilPowerOfTwo(T, @bitSizeOf(T)) catch unreachable); + const vector_bit_size: u16 = blk: { + if (cpu.arch.isX86()) { if (T == bool and std.Target.x86.featureSetHas(.prefer_mask_registers)) return 64; + if (std.Target.x86.featureSetHas(cpu.features, .avx512f) and !std.Target.x86.featureSetHasAny(cpu.features, .{ .prefer_256_bit, .prefer_128_bit })) break :blk 512; + if (std.Target.x86.featureSetHasAny(cpu.features, .{ .prefer_256_bit, .avx2 }) and !std.Target.x86.featureSetHas(cpu.features, .prefer_128_bit)) break :blk 256; + if (std.Target.x86.featureSetHas(cpu.features, .sse)) break :blk 128; + if (std.Target.x86.featureSetHasAny(cpu.features, .{ .mmx, .@"3dnow" })) break :blk 64; + } else if (cpu.arch.isARM()) { + if (std.Target.arm.featureSetHas(cpu.features, .neon)) break :blk 128; + } else if (cpu.arch.isAARCH64()) { + // SVE allows up to 2048 bits in the specification, as of 2022 the most powerful machine has implemented 512-bit + // I think is safer to just be on 128 until is more common + // TODO: Check on this return when bigger values are more common + if (std.Target.aarch64.featureSetHas(cpu.features, .sve)) break :blk 128; + if (std.Target.aarch64.featureSetHas(cpu.features, .neon)) break :blk 128; 
+ } else if (cpu.arch.isPPC() or cpu.arch.isPPC64()) { + if (std.Target.powerpc.featureSetHas(cpu.features, .altivec)) break :blk 128; + } else if (cpu.arch.isMIPS()) { + if (std.Target.mips.featureSetHas(cpu.features, .msa)) break :blk 128; + // TODO: Test MIPS capability to handle bigger vectors + // In theory MDMX and by extension mips3d have 32 registers of 64 bits which can use in parallel + // for multiple processing, but I don't know what's optimal here, if using + // the 2048 bits or using just 64 per vector or something in between + if (std.Target.mips.featureSetHas(cpu.features, std.Target.mips.Feature.mips3d)) break :blk 64; + } else if (cpu.arch.isRISCV()) { + // in risc-v the Vector Extension allows configurable vector sizes, but a standard size of 128 is a safe estimate + if (std.Target.riscv.featureSetHas(cpu.features, .v)) break :blk 128; + } else if (cpu.arch.isSPARC()) { + // TODO: Test Sparc capability to handle bigger vectors + // In theory Sparc have 32 registers of 64 bits which can use in parallel + // for multiple processing, but I don't know what's optimal here, if using + // the 2048 bits or using just 64 per vector or something in between + if (std.Target.sparc.featureSetHasAny(cpu.features, .{ .vis, .vis2, .vis3 })) break :blk 64; + } + return null; + }; + if (vector_bit_size <= element_bit_size) return null; - const vector_bit_size = blk: { - if (std.Target.x86.featureSetHas(.avx512f)) break :blk 512; - if (std.Target.x86.featureSetHas(.prefer_256_bit)) break :blk 256; - if (std.Target.x86.featureSetHas(.prefer_128_bit)) break :blk 128; - return null; - }; - const element_bit_size = std.math.max(8, std.math.ceilPowerOfTwo(T, @bitSizeOf(T))); - return @divExact(vector_bit_size, element_bit_size); - }, - else => { - return null; - }, - } + return @divExact(vector_bit_size, element_bit_size); } /// Suggests a target-dependant vector size for a given type, or null if scalars are recommended. |
