aboutsummaryrefslogtreecommitdiff
path: root/src/codegen
diff options
context:
space:
mode:
author: Matthew Lugg <mlugg@mlugg.co.uk> 2025-11-09 15:16:49 +0000
committer: Matthew Lugg <mlugg@mlugg.co.uk> 2025-11-12 16:00:16 +0000
commit: 69f39868b4125e79e4070a88bbdfcd3643dbc90d (patch)
tree: 7e90f08d2b5d1cb234957dbe0b6a48034afe4e30 /src/codegen
parent: 99a7884308d288bd39df9192c9094439b179ff60 (diff)
download: zig-69f39868b4125e79e4070a88bbdfcd3643dbc90d.tar.gz
download: zig-69f39868b4125e79e4070a88bbdfcd3643dbc90d.zip
Air.Legalize: revert to loops for scalarizations
I had tried unrolling the loops to avoid requiring the `vector_store_elem` instruction, but it's arguably a problem to generate O(N) code for an operation on `@Vector(N, T)`. In addition, that lowering emitted a lot of `.aggregate_init` instructions, which are themselves quite difficult to codegen. Reverting to loops requires reintroducing runtime vector indexing internally. However, I've confined it to a pair of instructions which are intended only for use by `Air.Legalize`: `legalize_vec_elem_val` (like `array_elem_val`, but for indexing a vector with a runtime-known index) and `legalize_vec_store_elem` (like the old `vector_store_elem` instruction). These are explicitly documented as *not* being emitted by Sema, so backends need only implement them if they actually use an `Air.Legalize.Feature` which emits them (otherwise they can be marked as `unreachable`).
Diffstat (limited to 'src/codegen')
-rw-r--r-- src/codegen/aarch64/Select.zig 9
-rw-r--r-- src/codegen/c.zig 4
-rw-r--r-- src/codegen/llvm.zig 5
-rw-r--r-- src/codegen/riscv64/CodeGen.zig 5
-rw-r--r-- src/codegen/sparc64/CodeGen.zig 5
-rw-r--r-- src/codegen/wasm/CodeGen.zig 4
-rw-r--r-- src/codegen/x86_64/CodeGen.zig 630
7 files changed, 661 insertions, 1 deletions
diff --git a/src/codegen/aarch64/Select.zig b/src/codegen/aarch64/Select.zig
index 36ca69e589..64aeeb7ff4 100644
--- a/src/codegen/aarch64/Select.zig
+++ b/src/codegen/aarch64/Select.zig
@@ -134,6 +134,10 @@ pub fn analyze(isel: *Select, air_body: []const Air.Inst.Index) !void {
var air_inst_index = air_body[air_body_index];
const initial_def_order_len = isel.def_order.count();
air_tag: switch (air_tags[@intFromEnum(air_inst_index)]) {
+ // No "scalarize" legalizations are enabled, so these instructions never appear.
+ .legalize_vec_elem_val => unreachable,
+ .legalize_vec_store_elem => unreachable,
+
.arg,
.ret_addr,
.frame_addr,
@@ -950,6 +954,11 @@ pub fn body(isel: *Select, air_body: []const Air.Inst.Index) error{ OutOfMemory,
};
air_tag: switch (air.next().?) {
else => |air_tag| return isel.fail("unimplemented {t}", .{air_tag}),
+
+ // No "scalarize" legalizations are enabled, so these instructions never appear.
+ .legalize_vec_elem_val => unreachable,
+ .legalize_vec_store_elem => unreachable,
+
.arg => {
const arg_vi = isel.live_values.fetchRemove(air.inst_index).?.value;
defer arg_vi.deref(isel);
diff --git a/src/codegen/c.zig b/src/codegen/c.zig
index 0abea3d503..a19c4bb346 100644
--- a/src/codegen/c.zig
+++ b/src/codegen/c.zig
@@ -3325,6 +3325,10 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) Error!void {
// zig fmt: off
.inferred_alloc, .inferred_alloc_comptime => unreachable,
+ // No "scalarize" legalizations are enabled, so these instructions never appear.
+ .legalize_vec_elem_val => unreachable,
+ .legalize_vec_store_elem => unreachable,
+
.arg => try airArg(f, inst),
.breakpoint => try airBreakpoint(f),
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
index 1160c2958e..b862a23ddc 100644
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@@ -4886,6 +4886,11 @@ pub const FuncGen = struct {
const val: Builder.Value = switch (air_tags[@intFromEnum(inst)]) {
// zig fmt: off
+
+ // No "scalarize" legalizations are enabled, so these instructions never appear.
+ .legalize_vec_elem_val => unreachable,
+ .legalize_vec_store_elem => unreachable,
+
.add => try self.airAdd(inst, .normal),
.add_optimized => try self.airAdd(inst, .fast),
.add_wrap => try self.airAddWrap(inst),
diff --git a/src/codegen/riscv64/CodeGen.zig b/src/codegen/riscv64/CodeGen.zig
index bf5e5b6718..cdca3c2fd8 100644
--- a/src/codegen/riscv64/CodeGen.zig
+++ b/src/codegen/riscv64/CodeGen.zig
@@ -1391,6 +1391,11 @@ fn genBody(func: *Func, body: []const Air.Inst.Index) InnerError!void {
const tag = air_tags[@intFromEnum(inst)];
switch (tag) {
// zig fmt: off
+
+ // No "scalarize" legalizations are enabled, so these instructions never appear.
+ .legalize_vec_elem_val => unreachable,
+ .legalize_vec_store_elem => unreachable,
+
.add,
.add_wrap,
.sub,
diff --git a/src/codegen/sparc64/CodeGen.zig b/src/codegen/sparc64/CodeGen.zig
index 684bfcfabb..4cbe07c762 100644
--- a/src/codegen/sparc64/CodeGen.zig
+++ b/src/codegen/sparc64/CodeGen.zig
@@ -479,6 +479,11 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
self.reused_operands = @TypeOf(self.reused_operands).initEmpty();
switch (air_tags[@intFromEnum(inst)]) {
// zig fmt: off
+
+ // No "scalarize" legalizations are enabled, so these instructions never appear.
+ .legalize_vec_elem_val => unreachable,
+ .legalize_vec_store_elem => unreachable,
+
.ptr_add => try self.airPtrArithmetic(inst, .ptr_add),
.ptr_sub => try self.airPtrArithmetic(inst, .ptr_sub),
diff --git a/src/codegen/wasm/CodeGen.zig b/src/codegen/wasm/CodeGen.zig
index b7f7aa151d..684513bf82 100644
--- a/src/codegen/wasm/CodeGen.zig
+++ b/src/codegen/wasm/CodeGen.zig
@@ -1786,6 +1786,10 @@ fn buildPointerOffset(cg: *CodeGen, ptr_value: WValue, offset: u64, action: enum
fn genInst(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
const air_tags = cg.air.instructions.items(.tag);
return switch (air_tags[@intFromEnum(inst)]) {
+ // No "scalarize" legalizations are enabled, so these instructions never appear.
+ .legalize_vec_elem_val => unreachable,
+ .legalize_vec_store_elem => unreachable,
+
.inferred_alloc, .inferred_alloc_comptime => unreachable,
.add => cg.airBinOp(inst, .add),
diff --git a/src/codegen/x86_64/CodeGen.zig b/src/codegen/x86_64/CodeGen.zig
index 94394185bd..f0772dcd73 100644
--- a/src/codegen/x86_64/CodeGen.zig
+++ b/src/codegen/x86_64/CodeGen.zig
@@ -103926,7 +103926,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
try ops[0].toOffset(0, cg);
try ops[0].finish(inst, &.{ty_op.operand}, &ops, cg);
},
- .array_elem_val => {
+ .array_elem_val, .legalize_vec_elem_val => {
const bin_op = air_datas[@intFromEnum(inst)].bin_op;
const array_ty = cg.typeOf(bin_op.lhs);
const res_ty = array_ty.elemType2(zcu);
@@ -173061,6 +173061,634 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
.c_va_copy => try cg.airVaCopy(inst),
.c_va_end => try cg.airVaEnd(inst),
.c_va_start => try cg.airVaStart(inst),
+ .legalize_vec_store_elem => {
+ const pl_op = air_datas[@intFromEnum(inst)].pl_op;
+ const bin = cg.air.extraData(Air.Bin, pl_op.payload).data;
+ // vector_ptr, index, elem_val
+ var ops = try cg.tempsFromOperands(inst, .{ pl_op.operand, bin.lhs, bin.rhs });
+ cg.select(&.{}, &.{}, &ops, comptime &.{ .{
+ .src_constraints = .{ .{ .ptr_bool_vec = .byte }, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 0 } } },
+ },
+ .extra_temps = .{
+ .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ },
+ .{ ._, ._r, .bt, .tmp0d, .src1d, ._, ._ },
+ .{ ._, ._, .mov, .lea(.src0b), .tmp0b, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .{ .ptr_bool_vec = .byte }, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 1 } } },
+ },
+ .extra_temps = .{
+ .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ },
+ .{ ._, ._s, .bt, .tmp0d, .src1d, ._, ._ },
+ .{ ._, ._, .mov, .lea(.src0b), .tmp0b, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .cmov, null, null, null },
+ .src_constraints = .{ .{ .ptr_bool_vec = .byte }, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
+ },
+ .extra_temps = .{
+ .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+ .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ },
+ .{ ._, ._, .mov, .tmp1d, .tmp0d, ._, ._ },
+ .{ ._, ._r, .bt, .tmp1d, .src1d, ._, ._ },
+ .{ ._, ._s, .bt, .tmp0d, .src1d, ._, ._ },
+ .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
+ .{ ._, ._z, .cmov, .tmp0d, .tmp1d, ._, ._ },
+ .{ ._, ._, .mov, .lea(.src0b), .tmp0b, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .{ .ptr_bool_vec = .byte }, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
+ },
+ .extra_temps = .{
+ .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ },
+ .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
+ .{ ._, ._nz, .j, .@"0f", ._, ._, ._ },
+ .{ ._, ._r, .bt, .tmp0d, .src1d, ._, ._ },
+ .{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
+ .{ .@"0:", ._s, .bt, .tmp0d, .src1d, ._, ._ },
+ .{ .@"1:", ._, .mov, .lea(.src0b), .tmp0b, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .{ .ptr_bool_vec = .word }, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 0 } } },
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._r, .bt, .lea(.src0w), .src1w, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .{ .ptr_bool_vec = .word }, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 1 } } },
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._s, .bt, .lea(.src0d), .src1d, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .cmov, null, null, null },
+ .src_constraints = .{ .{ .ptr_bool_vec = .word }, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
+ },
+ .extra_temps = .{
+ .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
+ .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._, .movzx, .tmp0d, .lea(.src0w), ._, ._ },
+ .{ ._, ._, .mov, .tmp1d, .tmp0d, ._, ._ },
+ .{ ._, ._r, .bt, .tmp1d, .src1d, ._, ._ },
+ .{ ._, ._s, .bt, .tmp0d, .src1d, ._, ._ },
+ .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
+ .{ ._, ._z, .cmov, .tmp0d, .tmp1d, ._, ._ },
+ .{ ._, ._, .mov, .lea(.src0w), .tmp0w, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .{ .ptr_bool_vec = .word }, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
+ .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
+ .{ ._, ._r, .bt, .lea(.src0w), .src1w, ._, ._ },
+ .{ ._, ._mp, .j, .@"0f", ._, ._, ._ },
+ .{ .@"1:", ._s, .bt, .lea(.src0w), .src1w, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .ptr_any_bool_vec, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 0 } } },
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._r, .bt, .lea(.src0d), .src1d, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .ptr_any_bool_vec, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 1 } } },
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._s, .bt, .lea(.src0d), .src1d, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .cmov, null, null, null },
+ .src_constraints = .{ .{ .ptr_bool_vec = .dword }, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
+ },
+ .extra_temps = .{
+ .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+ .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._, .mov, .tmp0d, .lea(.src0d), ._, ._ },
+ .{ ._, ._, .mov, .tmp1d, .tmp0d, ._, ._ },
+ .{ ._, ._r, .bt, .tmp1d, .src1d, ._, ._ },
+ .{ ._, ._s, .bt, .tmp0d, .src1d, ._, ._ },
+ .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
+ .{ ._, ._z, .cmov, .tmp0d, .tmp1d, ._, ._ },
+ .{ ._, ._, .mov, .lea(.src0d), .tmp0d, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .@"64bit", .cmov, null, null },
+ .src_constraints = .{ .{ .ptr_bool_vec = .qword }, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
+ },
+ .extra_temps = .{
+ .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+ .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._, .mov, .tmp0q, .lea(.src0q), ._, ._ },
+ .{ ._, ._, .mov, .tmp1q, .tmp0q, ._, ._ },
+ .{ ._, ._r, .bt, .tmp1q, .src1q, ._, ._ },
+ .{ ._, ._s, .bt, .tmp0q, .src1q, ._, ._ },
+ .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
+ .{ ._, ._z, .cmov, .tmp0q, .tmp1q, ._, ._ },
+ .{ ._, ._, .mov, .lea(.src0q), .tmp0q, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .cmov, null, null, null },
+ .src_constraints = .{ .ptr_any_bool_vec, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
+ },
+ .extra_temps = .{
+ .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+ .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+ .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._, .mov, .tmp0d, .src1d, ._, ._ },
+ .{ ._, ._r, .sh, .tmp0d, .ui(5), ._, ._ },
+ .{ ._, ._, .mov, .tmp1d, .leasi(.src0d, .@"4", .tmp0), ._, ._ },
+ .{ ._, ._, .mov, .tmp2d, .tmp1d, ._, ._ },
+ .{ ._, ._r, .bt, .tmp2d, .src1d, ._, ._ },
+ .{ ._, ._s, .bt, .tmp1d, .src1d, ._, ._ },
+ .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
+ .{ ._, ._z, .cmov, .tmp1d, .tmp2d, ._, ._ },
+ .{ ._, ._, .mov, .leasi(.src0d, .@"4", .tmp0), .tmp1d, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .ptr_any_bool_vec, .any, .bool },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
+ },
+ .clobbers = .{ .eflags = true },
+ .each = .{ .once = &.{
+ .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
+ .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
+ .{ ._, ._r, .bt, .lea(.src0d), .src1d, ._, ._ },
+ .{ ._, ._mp, .j, .@"0f", ._, ._, ._ },
+ .{ .@"1:", ._s, .bt, .lea(.src0d), .src1d, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .any, .any, .{ .int = .byte } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .imm8 } },
+ .{ .src = .{ .to_gpr, .simm32, .to_gpr } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._, .mov, .leaa(.src0b, .add_src0_elem_size_mul_src1), .src2b, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .any, .any, .{ .int = .byte } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .imm8 } },
+ .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._, .mov, .leai(.src0b, .src1), .src2b, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .any, .any, .{ .int = .word } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .imm16 } },
+ .{ .src = .{ .to_gpr, .simm32, .to_gpr } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._, .mov, .leaa(.src0w, .add_src0_elem_size_mul_src1), .src2w, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .any, .any, .{ .int = .word } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .imm16 } },
+ .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._, .mov, .leasi(.src0w, .@"2", .src1), .src2w, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .avx, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .word } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, .vp_w, .extr, .leaa(.src0w, .add_src0_elem_size_mul_src1), .src2x, .ui(0), ._ },
+ } },
+ }, .{
+ .required_features = .{ .sse4_1, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .word } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, .p_w, .extr, .leaa(.src0w, .add_src0_elem_size_mul_src1), .src2x, .ui(0), ._ },
+ } },
+ }, .{
+ .required_features = .{ .sse2, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .word } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+ },
+ .extra_temps = .{
+ .{ .type = .f16, .kind = .{ .rc = .general_purpose } },
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ },
+ .each = .{ .once = &.{
+ .{ ._, .p_w, .extr, .tmp0d, .src2x, .ui(0), ._ },
+ .{ ._, ._, .mov, .leaa(.src0w, .add_src0_elem_size_mul_src1), .tmp0w, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .sse, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .word } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+ },
+ .extra_temps = .{
+ .{ .type = .f32, .kind = .mem },
+ .{ .type = .f16, .kind = .{ .rc = .general_purpose } },
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._ss, .mov, .mem(.tmp1d), .src2x, ._, ._ },
+ .{ ._, ._, .mov, .tmp1d, .mem(.tmp1d), ._, ._ },
+ .{ ._, ._, .mov, .leaa(.src0w, .add_src0_elem_size_mul_src1), .tmp1w, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .avx, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .word } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, .vp_w, .extr, .leasi(.src0w, .@"2", .src1), .src2x, .ui(0), ._ },
+ } },
+ }, .{
+ .required_features = .{ .sse4_1, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .word } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, .p_w, .extr, .leasi(.src0w, .@"2", .src1), .src2x, .ui(0), ._ },
+ } },
+ }, .{
+ .required_features = .{ .sse2, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .word } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+ },
+ .extra_temps = .{
+ .{ .type = .f16, .kind = .{ .rc = .general_purpose } },
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ },
+ .each = .{ .once = &.{
+ .{ ._, .p_w, .extr, .tmp0d, .src2x, .ui(0), ._ },
+ .{ ._, ._, .mov, .leasi(.src0w, .@"2", .src1), .tmp0w, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .sse, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .word } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+ },
+ .extra_temps = .{
+ .{ .type = .f32, .kind = .mem },
+ .{ .type = .f16, .kind = .{ .rc = .general_purpose } },
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ .unused,
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._ss, .mov, .mem(.tmp1d), .src2x, ._, ._ },
+ .{ ._, ._, .mov, .tmp1d, .mem(.tmp1d), ._, ._ },
+ .{ ._, ._, .mov, .leasi(.src0w, .@"2", .src1), .tmp1w, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .any, .any, .{ .int = .dword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .imm32 } },
+ .{ .src = .{ .to_gpr, .simm32, .to_gpr } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._, .mov, .leaa(.src0d, .add_src0_elem_size_mul_src1), .src2d, ._, ._ },
+ } },
+ }, .{
+ .src_constraints = .{ .any, .any, .{ .int = .dword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .imm32 } },
+ .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._, .mov, .leasi(.src0d, .@"4", .src1), .src2d, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .avx, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .dword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, .v_ss, .mov, .leaa(.src0d, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .sse, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .dword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._ss, .mov, .leaa(.src0d, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .avx, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .dword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, .v_ss, .mov, .leasi(.src0d, .@"4", .src1), .src2x, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .sse, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .dword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._ss, .mov, .leasi(.src0d, .@"4", .src1), .src2x, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .@"64bit", null, null, null },
+ .src_constraints = .{ .any, .any, .{ .int = .qword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .simm32 } },
+ .{ .src = .{ .to_gpr, .simm32, .to_gpr } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._, .mov, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2q, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .@"64bit", null, null, null },
+ .src_constraints = .{ .any, .any, .{ .int = .qword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .simm32 } },
+ .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._, .mov, .leasi(.src0q, .@"8", .src1), .src2q, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .avx, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .qword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, .v_sd, .mov, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .sse2, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .qword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._sd, .mov, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .sse, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .qword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._ps, .movl, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .avx, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .qword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, .v_sd, .mov, .leasi(.src0q, .@"8", .src1), .src2x, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .sse2, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .qword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._sd, .mov, .leasi(.src0q, .@"8", .src1), .src2x, ._, ._ },
+ } },
+ }, .{
+ .required_features = .{ .sse, null, null, null },
+ .src_constraints = .{ .any, .any, .{ .float = .qword } },
+ .patterns = &.{
+ .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+ },
+ .each = .{ .once = &.{
+ .{ ._, ._ps, .movl, .leasi(.src0q, .@"8", .src1), .src2x, ._, ._ },
+ } },
+ } }) catch |err| switch (err) {
+ error.SelectFailed => {
+ const elem_size = cg.typeOf(bin.rhs).abiSize(zcu);
+ while (try ops[0].toRegClass(true, .general_purpose, cg) or
+ try ops[1].toRegClass(true, .general_purpose, cg))
+ {}
+ const base_reg = ops[0].tracking(cg).short.register.to64();
+ const rhs_reg = ops[1].tracking(cg).short.register.to64();
+ if (!std.math.isPowerOfTwo(elem_size)) {
+ try cg.spillEflagsIfOccupied();
+ try cg.asmRegisterRegisterImmediate(
+ .{ .i_, .mul },
+ rhs_reg,
+ rhs_reg,
+ .u(elem_size),
+ );
+ try cg.asmRegisterMemory(.{ ._, .lea }, base_reg, .{
+ .base = .{ .reg = base_reg },
+ .mod = .{ .rm = .{ .index = rhs_reg } },
+ });
+ } else if (elem_size > 8) {
+ try cg.spillEflagsIfOccupied();
+ try cg.asmRegisterImmediate(
+ .{ ._l, .sh },
+ rhs_reg,
+ .u(std.math.log2_int(u64, elem_size)),
+ );
+ try cg.asmRegisterMemory(.{ ._, .lea }, base_reg, .{
+ .base = .{ .reg = base_reg },
+ .mod = .{ .rm = .{ .index = rhs_reg } },
+ });
+ } else try cg.asmRegisterMemory(.{ ._, .lea }, base_reg, .{
+ .base = .{ .reg = base_reg },
+ .mod = .{ .rm = .{
+ .index = rhs_reg,
+ .scale = .fromFactor(@intCast(elem_size)),
+ } },
+ });
+ try ops[0].store(&ops[2], .{}, cg);
+ },
+ else => |e| return e,
+ };
+ for (ops) |op| try op.die(cg);
+ },
.work_item_id, .work_group_size, .work_group_id => unreachable,
}
try cg.resetTemps(@enumFromInt(0));