From b483defc5a5c2f93eb8a445974ab831ae4e4b321 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Fri, 30 May 2025 00:22:45 -0400 Subject: Legalize: implement scalarization of binary operations --- src/codegen/c.zig | 12 ++++++------ src/codegen/c/Type.zig | 21 +++++++++++++++------ src/codegen/llvm.zig | 8 +++++++- 3 files changed, 28 insertions(+), 13 deletions(-) (limited to 'src/codegen') diff --git a/src/codegen/c.zig b/src/codegen/c.zig index 8539efdbfe..d83eb8f771 100644 --- a/src/codegen/c.zig +++ b/src/codegen/c.zig @@ -1591,7 +1591,7 @@ pub const DeclGen = struct { try writer.writeAll("(("); try dg.renderCType(writer, ctype); return writer.print("){x})", .{ - try dg.fmtIntLiteral(try pt.undefValue(.usize), .Other), + try dg.fmtIntLiteral(.undef_usize, .Other), }); }, .slice => { @@ -1605,7 +1605,7 @@ pub const DeclGen = struct { const ptr_ty = ty.slicePtrFieldType(zcu); try dg.renderType(writer, ptr_ty); return writer.print("){x}, {0x}}}", .{ - try dg.fmtIntLiteral(try dg.pt.undefValue(.usize), .Other), + try dg.fmtIntLiteral(.undef_usize, .Other), }); }, }, @@ -6376,7 +6376,7 @@ fn airArrayToSlice(f: *Function, inst: Air.Inst.Index) !CValue { if (operand_child_ctype.info(ctype_pool) == .array) { try writer.writeByte('&'); try f.writeCValueDeref(writer, operand); - try writer.print("[{}]", .{try f.fmtIntLiteral(try pt.intValue(.usize, 0))}); + try writer.print("[{}]", .{try f.fmtIntLiteral(.zero_usize)}); } else try f.writeCValue(writer, operand, .Other); } try a.end(f, writer); @@ -6907,7 +6907,7 @@ fn airMemset(f: *Function, inst: Air.Inst.Index, safety: bool) !CValue { try writer.writeAll("for ("); try f.writeCValue(writer, index, .Other); try writer.writeAll(" = "); - try f.object.dg.renderValue(writer, try pt.intValue(.usize, 0), .Other); + try f.object.dg.renderValue(writer, .zero_usize, .Other); try writer.writeAll("; "); try f.writeCValue(writer, index, .Other); try writer.writeAll(" != "); @@ -8311,11 +8311,11 @@ const Vectorize = struct { try 
writer.writeAll("for ("); try f.writeCValue(writer, local, .Other); - try writer.print(" = {d}; ", .{try f.fmtIntLiteral(try pt.intValue(.usize, 0))}); + try writer.print(" = {d}; ", .{try f.fmtIntLiteral(.zero_usize)}); try f.writeCValue(writer, local, .Other); try writer.print(" < {d}; ", .{try f.fmtIntLiteral(try pt.intValue(.usize, ty.vectorLen(zcu)))}); try f.writeCValue(writer, local, .Other); - try writer.print(" += {d}) {{\n", .{try f.fmtIntLiteral(try pt.intValue(.usize, 1))}); + try writer.print(" += {d}) {{\n", .{try f.fmtIntLiteral(.one_usize)}); f.object.indent_writer.pushIndent(); break :index .{ .index = local }; diff --git a/src/codegen/c/Type.zig b/src/codegen/c/Type.zig index 7d3a485e2a..e5901ec626 100644 --- a/src/codegen/c/Type.zig +++ b/src/codegen/c/Type.zig @@ -1408,6 +1408,15 @@ pub const Pool = struct { .bits = pt.zcu.errorSetBits(), }, mod, kind), + .ptr_usize_type, + => return pool.getPointer(allocator, .{ + .elem_ctype = .usize, + }), + .ptr_const_comptime_int_type, + => return pool.getPointer(allocator, .{ + .elem_ctype = .void, + .@"const" = true, + }), .manyptr_u8_type, => return pool.getPointer(allocator, .{ .elem_ctype = .u8, @@ -1418,11 +1427,6 @@ pub const Pool = struct { .elem_ctype = .u8, .@"const" = true, }), - .single_const_pointer_to_comptime_int_type, - => return pool.getPointer(allocator, .{ - .elem_ctype = .void, - .@"const" = true, - }), .slice_const_u8_type, .slice_const_u8_sentinel_0_type, => { @@ -2157,11 +2161,16 @@ pub const Pool = struct { }, .undef, + .undef_bool, + .undef_usize, + .undef_u1, .zero, .zero_usize, + .zero_u1, .zero_u8, .one, .one_usize, + .one_u1, .one_u8, .four_u8, .negative_one, @@ -2172,7 +2181,7 @@ pub const Pool = struct { .bool_false, .empty_tuple, .none, - => unreachable, + => unreachable, // values, not types _ => |ip_index| switch (ip.indexToKey(ip_index)) { .int_type => |int_info| return pool.fromIntInfo(allocator, int_info, mod, kind), diff --git a/src/codegen/llvm.zig 
b/src/codegen/llvm.zig index d2a72502ed..1820faf90c 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -3081,10 +3081,11 @@ pub const Object = struct { .undefined_type, .enum_literal_type, => unreachable, + .ptr_usize_type, + .ptr_const_comptime_int_type, .manyptr_u8_type, .manyptr_const_u8_type, .manyptr_const_u8_sentinel_0_type, - .single_const_pointer_to_comptime_int_type, => .ptr, .slice_const_u8_type, .slice_const_u8_sentinel_0_type, @@ -3098,11 +3099,16 @@ pub const Object = struct { => unreachable, // values, not types .undef, + .undef_bool, + .undef_usize, + .undef_u1, .zero, .zero_usize, + .zero_u1, .zero_u8, .one, .one_usize, + .one_u1, .one_u8, .four_u8, .negative_one, -- cgit v1.2.3 From 6198f7afb76b7a5a6d359bfd24f8fbdabc77939b Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Fri, 30 May 2025 12:13:18 -0400 Subject: Sema: remove `all_vector_instructions` logic Backends can instead ask legalization on a per-instruction basis. --- src/Air/Legalize.zig | 19 +++-- src/Sema.zig | 187 +++++++++++-------------------------------- src/Zcu.zig | 9 --- src/arch/aarch64/CodeGen.zig | 29 +++++-- src/arch/arm/CodeGen.zig | 27 ++++++- src/arch/powerpc/CodeGen.zig | 4 + src/arch/riscv64/CodeGen.zig | 21 ++++- src/arch/sparc64/CodeGen.zig | 27 ++++++- src/arch/wasm/CodeGen.zig | 31 +++++-- src/arch/x86_64/CodeGen.zig | 4 +- src/codegen.zig | 9 +-- src/codegen/c.zig | 8 +- src/codegen/llvm.zig | 30 +++++-- src/codegen/spirv.zig | 12 +++ src/target.zig | 4 - 15 files changed, 224 insertions(+), 197 deletions(-) (limited to 'src/codegen') diff --git a/src/Air/Legalize.zig b/src/Air/Legalize.zig index 0e659bea06..870270f089 100644 --- a/src/Air/Legalize.zig +++ b/src/Air/Legalize.zig @@ -42,6 +42,7 @@ pub const Feature = enum { scalarize_shl_sat, scalarize_xor, scalarize_not, + scalarize_bitcast, scalarize_clz, scalarize_ctz, scalarize_popcount, @@ -76,7 +77,7 @@ pub const Feature = enum { scalarize_mul_add, /// Legalize (shift lhs, (splat rhs)) -> (shift lhs, 
rhs) - remove_shift_vector_rhs_splat, + unsplat_shift_rhs, /// Legalize reduce of a one element vector to a bitcast reduce_one_elem_to_bitcast, @@ -121,6 +122,7 @@ pub const Feature = enum { .shl_sat => .scalarize_shl_sat, .xor => .scalarize_xor, .not => .scalarize_not, + .bitcast => .scalarize_bitcast, .clz => .scalarize_clz, .ctz => .scalarize_ctz, .popcount => .scalarize_popcount, @@ -259,9 +261,7 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { => |air_tag| done: { const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op; if (!l.typeOf(bin_op.rhs).isVector(zcu)) break :done; - if (l.features.contains(comptime .scalarize(air_tag))) { - continue :inst try l.scalarize(inst, .bin_op); - } else if (l.features.contains(.remove_shift_vector_rhs_splat)) { + if (l.features.contains(.unsplat_shift_rhs)) { if (bin_op.rhs.toInterned()) |rhs_ip_index| switch (ip.indexToKey(rhs_ip_index)) { else => {}, .aggregate => |aggregate| switch (aggregate.storage) { @@ -282,6 +282,7 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { } } } + if (l.features.contains(comptime .scalarize(air_tag))) continue :inst try l.scalarize(inst, .bin_op); }, inline .not, .clz, @@ -302,8 +303,14 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op; if (ty_op.ty.toType().isVector(zcu)) continue :inst try l.scalarize(inst, .ty_op); }, - .bitcast, - => {}, + inline .bitcast, + => |air_tag| if (l.features.contains(comptime .scalarize(air_tag))) { + const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op; + const to_ty = ty_op.ty.toType(); + const from_ty = l.typeOf(ty_op.operand); + if (to_ty.isVector(zcu) and from_ty.isVector(zcu) and to_ty.vectorLen(zcu) == from_ty.vectorLen(zcu)) + continue :inst try l.scalarize(inst, .ty_op); + }, .block, .loop, => { diff --git a/src/Sema.zig b/src/Sema.zig index 
34c1bb4df7..c18f71d1fa 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -10165,16 +10165,7 @@ fn zirIntFromPtr(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError! try sema.requireRuntimeBlock(block, block.nodeOffset(inst_data.src_node), ptr_src); try sema.validateRuntimeValue(block, ptr_src, operand); try sema.checkLogicalPtrOperation(block, ptr_src, ptr_ty); - if (!is_vector or zcu.backendSupportsFeature(.all_vector_instructions)) { - return block.addBitCast(dest_ty, operand); - } - const new_elems = try sema.arena.alloc(Air.Inst.Ref, len); - for (new_elems, 0..) |*new_elem, i| { - const idx_ref = try pt.intRef(.usize, i); - const old_elem = try block.addBinOp(.array_elem_val, operand, idx_ref); - new_elem.* = try block.addBitCast(.usize, old_elem); - } - return block.addAggregateInit(dest_ty, new_elems); + return block.addBitCast(dest_ty, operand); } fn zirFieldVal(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref { @@ -10640,17 +10631,7 @@ fn zirFloatCast(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A if (dst_bits >= src_bits) { return sema.coerce(block, dest_ty, operand, operand_src); } - if (!is_vector or zcu.backendSupportsFeature(.all_vector_instructions)) { - return block.addTyOp(.fptrunc, dest_ty, operand); - } - const vec_len = operand_ty.vectorLen(zcu); - const new_elems = try sema.arena.alloc(Air.Inst.Ref, vec_len); - for (new_elems, 0..) 
|*new_elem, i| { - const idx_ref = try pt.intRef(.usize, i); - const old_elem = try block.addBinOp(.array_elem_val, operand, idx_ref); - new_elem.* = try block.addTyOp(.fptrunc, dest_scalar_ty, old_elem); - } - return block.addAggregateInit(dest_ty, new_elems); + return block.addTyOp(.fptrunc, dest_ty, operand); } fn zirElemVal(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref { @@ -20722,16 +20703,7 @@ fn zirIntFromBool(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError .storage = .{ .elems = new_elems }, } })); } - if (!is_vector or zcu.backendSupportsFeature(.all_vector_instructions)) { - return block.addBitCast(dest_ty, operand); - } - const new_elems = try sema.arena.alloc(Air.Inst.Ref, len); - for (new_elems, 0..) |*new_elem, i| { - const idx_ref = try pt.intRef(.usize, i); - const old_elem = try block.addBinOp(.array_elem_val, operand, idx_ref); - new_elem.* = try block.addBitCast(.u1, old_elem); - } - return block.addAggregateInit(dest_ty, new_elems); + return block.addBitCast(dest_ty, operand); } fn zirErrorName(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref { @@ -22327,42 +22299,23 @@ fn zirIntFromFloat(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileErro .storage = .{ .repeated_elem = (try pt.intValue(dest_scalar_ty, 0)).toIntern() }, } })); } - if (!is_vector or zcu.backendSupportsFeature(.all_vector_instructions)) { - const result = try block.addTyOp(if (block.float_mode == .optimized) .int_from_float_optimized else .int_from_float, dest_ty, operand); - if (block.wantSafety()) { - const back = try block.addTyOp(.float_from_int, operand_ty, result); - const diff = try block.addBinOp(if (block.float_mode == .optimized) .sub_optimized else .sub, operand, back); - const ok = if (is_vector) ok: { - const ok_pos = try block.addCmpVector(diff, Air.internedToRef((try sema.splat(operand_ty, try pt.floatValue(operand_scalar_ty, 1.0))).toIntern()), .lt); - const ok_neg = try 
block.addCmpVector(diff, Air.internedToRef((try sema.splat(operand_ty, try pt.floatValue(operand_scalar_ty, -1.0))).toIntern()), .gt); - const ok = try block.addBinOp(.bit_and, ok_pos, ok_neg); - break :ok try block.addReduce(ok, .And); - } else ok: { - const ok_pos = try block.addBinOp(if (block.float_mode == .optimized) .cmp_lt_optimized else .cmp_lt, diff, Air.internedToRef((try pt.floatValue(operand_ty, 1.0)).toIntern())); - const ok_neg = try block.addBinOp(if (block.float_mode == .optimized) .cmp_gt_optimized else .cmp_gt, diff, Air.internedToRef((try pt.floatValue(operand_ty, -1.0)).toIntern())); - break :ok try block.addBinOp(.bool_and, ok_pos, ok_neg); - }; - try sema.addSafetyCheck(block, src, ok, .integer_part_out_of_bounds); - } - return result; - } - const len = dest_ty.vectorLen(zcu); - const new_elems = try sema.arena.alloc(Air.Inst.Ref, len); - for (new_elems, 0..) |*new_elem, i| { - const idx_ref = try pt.intRef(.usize, i); - const old_elem = try block.addBinOp(.array_elem_val, operand, idx_ref); - const result = try block.addTyOp(if (block.float_mode == .optimized) .int_from_float_optimized else .int_from_float, dest_scalar_ty, old_elem); - if (block.wantSafety()) { - const back = try block.addTyOp(.float_from_int, operand_scalar_ty, result); - const diff = try block.addBinOp(.sub, old_elem, back); - const ok_pos = try block.addBinOp(if (block.float_mode == .optimized) .cmp_lt_optimized else .cmp_lt, diff, Air.internedToRef((try pt.floatValue(operand_scalar_ty, 1.0)).toIntern())); - const ok_neg = try block.addBinOp(if (block.float_mode == .optimized) .cmp_gt_optimized else .cmp_gt, diff, Air.internedToRef((try pt.floatValue(operand_scalar_ty, -1.0)).toIntern())); - const ok = try block.addBinOp(.bool_and, ok_pos, ok_neg); - try sema.addSafetyCheck(block, src, ok, .integer_part_out_of_bounds); - } - new_elem.* = result; + const result = try block.addTyOp(if (block.float_mode == .optimized) .int_from_float_optimized else .int_from_float, dest_ty, 
operand); + if (block.wantSafety()) { + const back = try block.addTyOp(.float_from_int, operand_ty, result); + const diff = try block.addBinOp(if (block.float_mode == .optimized) .sub_optimized else .sub, operand, back); + const ok = if (is_vector) ok: { + const ok_pos = try block.addCmpVector(diff, Air.internedToRef((try sema.splat(operand_ty, try pt.floatValue(operand_scalar_ty, 1.0))).toIntern()), .lt); + const ok_neg = try block.addCmpVector(diff, Air.internedToRef((try sema.splat(operand_ty, try pt.floatValue(operand_scalar_ty, -1.0))).toIntern()), .gt); + const ok = try block.addBinOp(.bit_and, ok_pos, ok_neg); + break :ok try block.addReduce(ok, .And); + } else ok: { + const ok_pos = try block.addBinOp(if (block.float_mode == .optimized) .cmp_lt_optimized else .cmp_lt, diff, Air.internedToRef((try pt.floatValue(operand_ty, 1.0)).toIntern())); + const ok_neg = try block.addBinOp(if (block.float_mode == .optimized) .cmp_gt_optimized else .cmp_gt, diff, Air.internedToRef((try pt.floatValue(operand_ty, -1.0)).toIntern())); + break :ok try block.addBinOp(.bool_and, ok_pos, ok_neg); + }; + try sema.addSafetyCheck(block, src, ok, .integer_part_out_of_bounds); } - return block.addAggregateInit(dest_ty, new_elems); + return result; } fn zirFloatFromInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref { @@ -22377,7 +22330,6 @@ fn zirFloatFromInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileErro const operand_ty = sema.typeOf(operand); try sema.checkVectorizableBinaryOperands(block, operand_src, dest_ty, operand_ty, src, operand_src); - const is_vector = dest_ty.zigTypeTag(zcu) == .vector; const dest_scalar_ty = dest_ty.scalarType(zcu); const operand_scalar_ty = operand_ty.scalarType(zcu); @@ -22393,17 +22345,7 @@ fn zirFloatFromInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileErro } try sema.requireRuntimeBlock(block, src, operand_src); - if (!is_vector or zcu.backendSupportsFeature(.all_vector_instructions)) { - 
return block.addTyOp(.float_from_int, dest_ty, operand); - } - const len = operand_ty.vectorLen(zcu); - const new_elems = try sema.arena.alloc(Air.Inst.Ref, len); - for (new_elems, 0..) |*new_elem, i| { - const idx_ref = try pt.intRef(.usize, i); - const old_elem = try block.addBinOp(.array_elem_val, operand, idx_ref); - new_elem.* = try block.addTyOp(.float_from_int, dest_scalar_ty, old_elem); - } - return block.addAggregateInit(dest_ty, new_elems); + return block.addTyOp(.float_from_int, dest_ty, operand); } fn zirPtrFromInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref { @@ -22473,69 +22415,34 @@ fn zirPtrFromInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError! } try sema.requireRuntimeBlock(block, src, operand_src); try sema.checkLogicalPtrOperation(block, src, ptr_ty); - if (!is_vector or zcu.backendSupportsFeature(.all_vector_instructions)) { - if (block.wantSafety() and (try elem_ty.hasRuntimeBitsSema(pt) or elem_ty.zigTypeTag(zcu) == .@"fn")) { - if (!ptr_ty.isAllowzeroPtr(zcu)) { - const is_non_zero = if (is_vector) all_non_zero: { - const zero_usize = Air.internedToRef((try sema.splat(operand_ty, .zero_usize)).toIntern()); - const is_non_zero = try block.addCmpVector(operand_coerced, zero_usize, .neq); - break :all_non_zero try block.addReduce(is_non_zero, .And); - } else try block.addBinOp(.cmp_neq, operand_coerced, .zero_usize); - try sema.addSafetyCheck(block, src, is_non_zero, .cast_to_null); - } - if (ptr_align.compare(.gt, .@"1")) { - const align_bytes_minus_1 = ptr_align.toByteUnits().? 
- 1; - const align_mask = Air.internedToRef((try sema.splat(operand_ty, try pt.intValue( - .usize, - if (elem_ty.fnPtrMaskOrNull(zcu)) |mask| - align_bytes_minus_1 & mask - else - align_bytes_minus_1, - ))).toIntern()); - const remainder = try block.addBinOp(.bit_and, operand_coerced, align_mask); - const is_aligned = if (is_vector) all_aligned: { - const splat_zero_usize = Air.internedToRef((try sema.splat(operand_ty, .zero_usize)).toIntern()); - const is_aligned = try block.addCmpVector(remainder, splat_zero_usize, .eq); - break :all_aligned try block.addReduce(is_aligned, .And); - } else try block.addBinOp(.cmp_eq, remainder, .zero_usize); - try sema.addSafetyCheck(block, src, is_aligned, .incorrect_alignment); - } - } - return block.addBitCast(dest_ty, operand_coerced); - } - - const len = dest_ty.vectorLen(zcu); if (block.wantSafety() and (try elem_ty.hasRuntimeBitsSema(pt) or elem_ty.zigTypeTag(zcu) == .@"fn")) { - for (0..len) |i| { - const idx_ref = try pt.intRef(.usize, i); - const elem_coerced = try block.addBinOp(.array_elem_val, operand_coerced, idx_ref); - if (!ptr_ty.isAllowzeroPtr(zcu)) { - const is_non_zero = try block.addBinOp(.cmp_neq, elem_coerced, .zero_usize); - try sema.addSafetyCheck(block, src, is_non_zero, .cast_to_null); - } - if (ptr_align.compare(.gt, .@"1")) { - const align_bytes_minus_1 = ptr_align.toByteUnits().? - 1; - const align_mask = Air.internedToRef((try pt.intValue( - .usize, - if (elem_ty.fnPtrMaskOrNull(zcu)) |mask| - align_bytes_minus_1 & mask - else - align_bytes_minus_1, - )).toIntern()); - const remainder = try block.addBinOp(.bit_and, elem_coerced, align_mask); - const is_aligned = try block.addBinOp(.cmp_eq, remainder, .zero_usize); - try sema.addSafetyCheck(block, src, is_aligned, .incorrect_alignment); - } - } - } - - const new_elems = try sema.arena.alloc(Air.Inst.Ref, len); - for (new_elems, 0..) 
|*new_elem, i| { - const idx_ref = try pt.intRef(.usize, i); - const old_elem = try block.addBinOp(.array_elem_val, operand_coerced, idx_ref); - new_elem.* = try block.addBitCast(ptr_ty, old_elem); - } - return block.addAggregateInit(dest_ty, new_elems); + if (!ptr_ty.isAllowzeroPtr(zcu)) { + const is_non_zero = if (is_vector) all_non_zero: { + const zero_usize = Air.internedToRef((try sema.splat(operand_ty, .zero_usize)).toIntern()); + const is_non_zero = try block.addCmpVector(operand_coerced, zero_usize, .neq); + break :all_non_zero try block.addReduce(is_non_zero, .And); + } else try block.addBinOp(.cmp_neq, operand_coerced, .zero_usize); + try sema.addSafetyCheck(block, src, is_non_zero, .cast_to_null); + } + if (ptr_align.compare(.gt, .@"1")) { + const align_bytes_minus_1 = ptr_align.toByteUnits().? - 1; + const align_mask = Air.internedToRef((try sema.splat(operand_ty, try pt.intValue( + .usize, + if (elem_ty.fnPtrMaskOrNull(zcu)) |mask| + align_bytes_minus_1 & mask + else + align_bytes_minus_1, + ))).toIntern()); + const remainder = try block.addBinOp(.bit_and, operand_coerced, align_mask); + const is_aligned = if (is_vector) all_aligned: { + const splat_zero_usize = Air.internedToRef((try sema.splat(operand_ty, .zero_usize)).toIntern()); + const is_aligned = try block.addCmpVector(remainder, splat_zero_usize, .eq); + break :all_aligned try block.addReduce(is_aligned, .And); + } else try block.addBinOp(.cmp_eq, remainder, .zero_usize); + try sema.addSafetyCheck(block, src, is_aligned, .incorrect_alignment); + } + } + return block.addBitCast(dest_ty, operand_coerced); } fn ptrFromIntVal( diff --git a/src/Zcu.zig b/src/Zcu.zig index c49a1d46b1..38e926298e 100644 --- a/src/Zcu.zig +++ b/src/Zcu.zig @@ -3840,15 +3840,6 @@ pub const Feature = enum { safety_checked_instructions, /// If the backend supports running from another thread. 
separate_thread, - /// If the backend supports the following AIR instructions with vector types: - /// * `Air.Inst.Tag.bit_and` - /// * `Air.Inst.Tag.bit_or` - /// * `Air.Inst.Tag.bitcast` - /// * `Air.Inst.Tag.float_from_int` - /// * `Air.Inst.Tag.fptrunc` - /// * `Air.Inst.Tag.int_from_float` - /// If not supported, Sema will scalarize the operation. - all_vector_instructions, }; pub fn backendSupportsFeature(zcu: *const Zcu, comptime feature: Feature) bool { diff --git a/src/arch/aarch64/CodeGen.zig b/src/arch/aarch64/CodeGen.zig index 6fd23cfd18..88fdeb9831 100644 --- a/src/arch/aarch64/CodeGen.zig +++ b/src/arch/aarch64/CodeGen.zig @@ -40,6 +40,10 @@ const gp = abi.RegisterClass.gp; const InnerError = CodeGenError || error{OutOfRegisters}; +pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { + return comptime &.initEmpty(); +} + gpa: Allocator, pt: Zcu.PerThread, air: Air, @@ -2261,12 +2265,13 @@ fn shiftExact( rhs_ty: Type, maybe_inst: ?Air.Inst.Index, ) InnerError!MCValue { - _ = rhs_ty; - const pt = self.pt; const zcu = pt.zcu; switch (lhs_ty.zigTypeTag(zcu)) { - .vector => return self.fail("TODO binary operations on vectors", .{}), + .vector => if (!rhs_ty.isVector(zcu)) + return self.fail("TODO vector shift with scalar rhs", .{}) + else + return self.fail("TODO binary operations on vectors", .{}), .int => { const int_info = lhs_ty.intInfo(zcu); if (int_info.bits <= 64) { @@ -2317,7 +2322,10 @@ fn shiftNormal( const pt = self.pt; const zcu = pt.zcu; switch (lhs_ty.zigTypeTag(zcu)) { - .vector => return self.fail("TODO binary operations on vectors", .{}), + .vector => if (!rhs_ty.isVector(zcu)) + return self.fail("TODO vector shift with scalar rhs", .{}) + else + return self.fail("TODO binary operations on vectors", .{}), .int => { const int_info = lhs_ty.intInfo(zcu); if (int_info.bits <= 64) { @@ -2874,7 +2882,10 @@ fn airShlWithOverflow(self: *Self, inst: Air.Inst.Index) InnerError!void { const overflow_bit_offset = 
@as(u32, @intCast(tuple_ty.structFieldOffset(1, zcu))); switch (lhs_ty.zigTypeTag(zcu)) { - .vector => return self.fail("TODO implement shl_with_overflow for vectors", .{}), + .vector => if (!rhs_ty.isVector(zcu)) + return self.fail("TODO implement vector shl_with_overflow with scalar rhs", .{}) + else + return self.fail("TODO implement shl_with_overflow for vectors", .{}), .int => { const int_info = lhs_ty.intInfo(zcu); if (int_info.bits <= 64) { @@ -2993,8 +3004,14 @@ fn airShlWithOverflow(self: *Self, inst: Air.Inst.Index) InnerError!void { } fn airShlSat(self: *Self, inst: Air.Inst.Index) InnerError!void { + const zcu = self.pt.zcu; const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; - const result: MCValue = if (self.liveness.isUnused(inst)) .dead else return self.fail("TODO implement shl_sat for {}", .{self.target.cpu.arch}); + const result: MCValue = if (self.liveness.isUnused(inst)) + .dead + else if (self.typeOf(bin_op.lhs).isVector(zcu) and !self.typeOf(bin_op.rhs).isVector(zcu)) + return self.fail("TODO implement vector shl_sat with scalar rhs for {}", .{self.target.cpu.arch}) + else + return self.fail("TODO implement shl_sat for {}", .{self.target.cpu.arch}); return self.finishAir(inst, result, .{ bin_op.lhs, bin_op.rhs, .none }); } diff --git a/src/arch/arm/CodeGen.zig b/src/arch/arm/CodeGen.zig index e9d0e91db1..a018302566 100644 --- a/src/arch/arm/CodeGen.zig +++ b/src/arch/arm/CodeGen.zig @@ -41,6 +41,10 @@ const gp = abi.RegisterClass.gp; const InnerError = CodeGenError || error{OutOfRegisters}; +pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { + return comptime &.initEmpty(); +} + gpa: Allocator, pt: Zcu.PerThread, air: Air, @@ -1857,7 +1861,10 @@ fn airShlWithOverflow(self: *Self, inst: Air.Inst.Index) !void { const overflow_bit_offset: u32 = @intCast(tuple_ty.structFieldOffset(1, zcu)); switch (lhs_ty.zigTypeTag(zcu)) { - .vector => return self.fail("TODO implement shl_with_overflow 
for vectors", .{}), + .vector => if (!rhs_ty.isVector(zcu)) + return self.fail("TODO implement vector shl_with_overflow with scalar rhs", .{}) + else + return self.fail("TODO implement shl_with_overflow for vectors", .{}), .int => { const int_info = lhs_ty.intInfo(zcu); if (int_info.bits <= 32) { @@ -1978,8 +1985,14 @@ fn airShlWithOverflow(self: *Self, inst: Air.Inst.Index) !void { } fn airShlSat(self: *Self, inst: Air.Inst.Index) !void { + const zcu = self.pt.zcu; const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; - const result: MCValue = if (self.liveness.isUnused(inst)) .dead else return self.fail("TODO implement shl_sat for {}", .{self.target.cpu.arch}); + const result: MCValue = if (self.liveness.isUnused(inst)) + .dead + else if (self.typeOf(bin_op.lhs).isVector(zcu) and !self.typeOf(bin_op.rhs).isVector(zcu)) + return self.fail("TODO implement vector shl_sat with scalar rhs for {}", .{self.target.cpu.arch}) + else + return self.fail("TODO implement shl_sat for {}", .{self.target.cpu.arch}); return self.finishAir(inst, result, .{ bin_op.lhs, bin_op.rhs, .none }); } @@ -3788,7 +3801,10 @@ fn shiftExact( const pt = self.pt; const zcu = pt.zcu; switch (lhs_ty.zigTypeTag(zcu)) { - .vector => return self.fail("TODO ARM binary operations on vectors", .{}), + .vector => if (!rhs_ty.isVector(zcu)) + return self.fail("TODO ARM vector shift with scalar rhs", .{}) + else + return self.fail("TODO ARM binary operations on vectors", .{}), .int => { const int_info = lhs_ty.intInfo(zcu); if (int_info.bits <= 32) { @@ -3828,7 +3844,10 @@ fn shiftNormal( const pt = self.pt; const zcu = pt.zcu; switch (lhs_ty.zigTypeTag(zcu)) { - .vector => return self.fail("TODO ARM binary operations on vectors", .{}), + .vector => if (!rhs_ty.isVector(zcu)) + return self.fail("TODO ARM vector shift with scalar rhs", .{}) + else + return self.fail("TODO ARM binary operations on vectors", .{}), .int => { const int_info = lhs_ty.intInfo(zcu); if (int_info.bits <= 32) 
{ diff --git a/src/arch/powerpc/CodeGen.zig b/src/arch/powerpc/CodeGen.zig index 6334b65ff8..a3a4615b4a 100644 --- a/src/arch/powerpc/CodeGen.zig +++ b/src/arch/powerpc/CodeGen.zig @@ -10,6 +10,10 @@ const Zcu = @import("../../Zcu.zig"); const assert = std.debug.assert; const log = std.log.scoped(.codegen); +pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { + return comptime &.initEmpty(); +} + pub fn generate( bin_file: *link.File, pt: Zcu.PerThread, diff --git a/src/arch/riscv64/CodeGen.zig b/src/arch/riscv64/CodeGen.zig index edba985beb..5732f4cd41 100644 --- a/src/arch/riscv64/CodeGen.zig +++ b/src/arch/riscv64/CodeGen.zig @@ -51,6 +51,10 @@ const Instruction = encoding.Instruction; const InnerError = CodeGenError || error{OutOfRegisters}; +pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { + return comptime &.initEmpty(); +} + pt: Zcu.PerThread, air: Air, liveness: Air.Liveness, @@ -2764,6 +2768,7 @@ fn genBinOp( .shl, .shl_exact, => { + if (lhs_ty.isVector(zcu) and !rhs_ty.isVector(zcu)) return func.fail("TODO: vector shift with scalar rhs", .{}); if (bit_size > 64) return func.fail("TODO: genBinOp shift > 64 bits, {}", .{bit_size}); try func.truncateRegister(rhs_ty, rhs_reg); @@ -3248,8 +3253,14 @@ fn airMulWithOverflow(func: *Func, inst: Air.Inst.Index) !void { } fn airShlWithOverflow(func: *Func, inst: Air.Inst.Index) !void { + const zcu = func.pt.zcu; const bin_op = func.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; - const result: MCValue = if (func.liveness.isUnused(inst)) .unreach else return func.fail("TODO implement airShlWithOverflow", .{}); + const result: MCValue = if (func.liveness.isUnused(inst)) + .unreach + else if (func.typeOf(bin_op.lhs).isVector(zcu) and !func.typeOf(bin_op.rhs).isVector(zcu)) + return func.fail("TODO implement vector airShlWithOverflow with scalar rhs", .{}) + else + return func.fail("TODO implement airShlWithOverflow", .{}); return 
func.finishAir(inst, result, .{ bin_op.lhs, bin_op.rhs, .none }); } @@ -3266,8 +3277,14 @@ fn airMulSat(func: *Func, inst: Air.Inst.Index) !void { } fn airShlSat(func: *Func, inst: Air.Inst.Index) !void { + const zcu = func.pt.zcu; const bin_op = func.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; - const result: MCValue = if (func.liveness.isUnused(inst)) .unreach else return func.fail("TODO implement airShlSat", .{}); + const result: MCValue = if (func.liveness.isUnused(inst)) + .unreach + else if (func.typeOf(bin_op.lhs).isVector(zcu) and !func.typeOf(bin_op.rhs).isVector(zcu)) + return func.fail("TODO implement vector airShlSat with scalar rhs", .{}) + else + return func.fail("TODO implement airShlSat", .{}); return func.finishAir(inst, result, .{ bin_op.lhs, bin_op.rhs, .none }); } diff --git a/src/arch/sparc64/CodeGen.zig b/src/arch/sparc64/CodeGen.zig index 4235de94f5..ead7ef3e99 100644 --- a/src/arch/sparc64/CodeGen.zig +++ b/src/arch/sparc64/CodeGen.zig @@ -41,6 +41,10 @@ const Self = @This(); const InnerError = CodeGenError || error{OutOfRegisters}; +pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { + return comptime &.initEmpty(); +} + const RegisterView = enum(u1) { caller, callee, @@ -2270,8 +2274,14 @@ fn airSetUnionTag(self: *Self, inst: Air.Inst.Index) !void { } fn airShlSat(self: *Self, inst: Air.Inst.Index) !void { + const zcu = self.pt.zcu; const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; - const result: MCValue = if (self.liveness.isUnused(inst)) .dead else return self.fail("TODO implement shl_sat for {}", .{self.target.cpu.arch}); + const result: MCValue = if (self.liveness.isUnused(inst)) + .dead + else if (self.typeOf(bin_op.lhs).isVector(zcu) and !self.typeOf(bin_op.rhs).isVector(zcu)) + return self.fail("TODO implement vector shl_sat with scalar rhs for {}", .{self.target.cpu.arch}) + else + return self.fail("TODO implement shl_sat for {}", .{self.target.cpu.arch}); 
return self.finishAir(inst, result, .{ bin_op.lhs, bin_op.rhs, .none }); } @@ -2287,7 +2297,10 @@ fn airShlWithOverflow(self: *Self, inst: Air.Inst.Index) !void { const rhs_ty = self.typeOf(extra.rhs); switch (lhs_ty.zigTypeTag(zcu)) { - .vector => return self.fail("TODO implement mul_with_overflow for vectors", .{}), + .vector => if (!rhs_ty.isVector(zcu)) + return self.fail("TODO implement vector shl_with_overflow with scalar rhs", .{}) + else + return self.fail("TODO implement shl_with_overflow for vectors", .{}), .int => { const int_info = lhs_ty.intInfo(zcu); if (int_info.bits <= 64) { @@ -3002,7 +3015,10 @@ fn binOp( // Truncate if necessary switch (lhs_ty.zigTypeTag(zcu)) { - .vector => return self.fail("TODO binary operations on vectors", .{}), + .vector => if (!rhs_ty.isVector(zcu)) + return self.fail("TODO vector shift with scalar rhs", .{}) + else + return self.fail("TODO binary operations on vectors", .{}), .int => { const int_info = lhs_ty.intInfo(zcu); if (int_info.bits <= 64) { @@ -3024,7 +3040,10 @@ fn binOp( .shr_exact, => { switch (lhs_ty.zigTypeTag(zcu)) { - .vector => return self.fail("TODO binary operations on vectors", .{}), + .vector => if (!rhs_ty.isVector(zcu)) + return self.fail("TODO vector shift with scalar rhs", .{}) + else + return self.fail("TODO binary operations on vectors", .{}), .int => { const int_info = lhs_ty.intInfo(zcu); if (int_info.bits <= 64) { diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig index a48f7012f5..a130f1508d 100644 --- a/src/arch/wasm/CodeGen.zig +++ b/src/arch/wasm/CodeGen.zig @@ -31,6 +31,10 @@ const libcFloatSuffix = target_util.libcFloatSuffix; const compilerRtFloatAbbrev = target_util.compilerRtFloatAbbrev; const compilerRtIntAbbrev = target_util.compilerRtIntAbbrev; +pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { + return comptime &.initEmpty(); +} + /// Reference to the function declaration the code /// section belongs to owner_nav: 
InternPool.Nav.Index, @@ -2638,6 +2642,10 @@ fn airBinOp(cg: *CodeGen, inst: Air.Inst.Index, op: Op) InnerError!void { // For big integers we can ignore this as we will call into compiler-rt which handles this. const result = switch (op) { .shr, .shl => result: { + if (lhs_ty.isVector(zcu) and !rhs_ty.isVector(zcu)) { + return cg.fail("TODO: implement vector '{s}' with scalar rhs", .{@tagName(op)}); + } + const lhs_wasm_bits = toWasmBits(@intCast(lhs_ty.bitSize(zcu))) orelse { return cg.fail("TODO: implement '{s}' for types larger than 128 bits", .{@tagName(op)}); }; @@ -3055,8 +3063,12 @@ fn airWrapBinOp(cg: *CodeGen, inst: Air.Inst.Index, op: Op) InnerError!void { const lhs_ty = cg.typeOf(bin_op.lhs); const rhs_ty = cg.typeOf(bin_op.rhs); - if (lhs_ty.zigTypeTag(zcu) == .vector or rhs_ty.zigTypeTag(zcu) == .vector) { - return cg.fail("TODO: Implement wrapping arithmetic for vectors", .{}); + if (lhs_ty.isVector(zcu)) { + if ((op == .shr or op == .shl) and !rhs_ty.isVector(zcu)) { + return cg.fail("TODO: implement wrapping vector '{s}' with scalar rhs", .{@tagName(op)}); + } else { + return cg.fail("TODO: implement wrapping '{s}' for vectors", .{@tagName(op)}); + } } // For certain operations, such as shifting, the types are different. 
@@ -6067,13 +6079,17 @@ fn airShlWithOverflow(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { const ty = cg.typeOf(extra.lhs); const rhs_ty = cg.typeOf(extra.rhs); - if (ty.zigTypeTag(zcu) == .vector) { - return cg.fail("TODO: Implement overflow arithmetic for vectors", .{}); + if (ty.isVector(zcu)) { + if (!rhs_ty.isVector(zcu)) { + return cg.fail("TODO: implement vector 'shl_with_overflow' with scalar rhs", .{}); + } else { + return cg.fail("TODO: implement vector 'shl_with_overflow'", .{}); + } } const int_info = ty.intInfo(zcu); const wasm_bits = toWasmBits(int_info.bits) orelse { - return cg.fail("TODO: Implement shl_with_overflow for integer bitsize: {d}", .{int_info.bits}); + return cg.fail("TODO: implement 'shl_with_overflow' for integer bitsize: {d}", .{int_info.bits}); }; // Ensure rhs is coerced to lhs as they must have the same WebAssembly types @@ -6994,6 +7010,11 @@ fn airShlSat(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { const pt = cg.pt; const zcu = pt.zcu; + + if (cg.typeOf(bin_op.lhs).isVector(zcu) and !cg.typeOf(bin_op.rhs).isVector(zcu)) { + return cg.fail("TODO: implement vector 'shl_sat' with scalar rhs", .{}); + } + const ty = cg.typeOfIndex(inst); const int_info = ty.intInfo(zcu); const is_signed = int_info.signedness == .signed; diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 8db15122d0..49a32fc66c 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -32,7 +32,7 @@ const FrameIndex = bits.FrameIndex; const InnerError = codegen.CodeGenError || error{OutOfRegisters}; -pub inline fn legalizeFeatures(target: *const std.Target) *const Air.Legalize.Features { +pub fn legalizeFeatures(target: *const std.Target) *const Air.Legalize.Features { @setEvalBranchQuota(1_200); return switch (target.ofmt == .coff) { inline false, true => |use_old| comptime &.init(.{ @@ -86,7 +86,7 @@ pub inline fn legalizeFeatures(target: *const std.Target) *const Air.Legalize.Fe 
.scalarize_float_from_int = use_old, .scalarize_mul_add = use_old, - .remove_shift_vector_rhs_splat = false, + .unsplat_shift_rhs = false, .reduce_one_elem_to_bitcast = true, }), }; diff --git a/src/codegen.zig b/src/codegen.zig index d482cca4b2..8d609682f1 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -52,7 +52,7 @@ fn importBackend(comptime backend: std.builtin.CompilerBackend) type { pub fn legalizeFeatures(pt: Zcu.PerThread, nav_index: InternPool.Nav.Index) *const Air.Legalize.Features { const zcu = pt.zcu; const target = &zcu.navFileScope(nav_index).mod.?.resolved_target.result; - switch (target_util.zigBackend(target.*, zcu.comp.config.use_llvm)) { + return switch (target_util.zigBackend(target.*, zcu.comp.config.use_llvm)) { else => unreachable, inline .stage2_llvm, .stage2_c, @@ -65,11 +65,8 @@ pub fn legalizeFeatures(pt: Zcu.PerThread, nav_index: InternPool.Nav.Index) *con .stage2_sparc64, .stage2_spirv64, .stage2_powerpc, - => |backend| { - const Backend = importBackend(backend); - return if (@hasDecl(Backend, "legalizeFeatures")) Backend.legalizeFeatures(target) else comptime &.initEmpty(); - }, - } + => |backend| importBackend(backend).legalizeFeatures(target), + }; } pub fn generateFunction( diff --git a/src/codegen/c.zig b/src/codegen/c.zig index d83eb8f771..2d3b236a86 100644 --- a/src/codegen/c.zig +++ b/src/codegen/c.zig @@ -20,6 +20,10 @@ const Alignment = InternPool.Alignment; const BigIntLimb = std.math.big.Limb; const BigInt = std.math.big.int; +pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { + return comptime &.initEmpty(); +} + pub const CType = @import("c/Type.zig"); pub const CValue = union(enum) { @@ -4179,7 +4183,7 @@ fn airOverflow(f: *Function, inst: Air.Inst.Index, operation: []const u8, info: try v.elem(f, w); try w.writeAll(", "); try f.writeCValue(w, rhs, .FunctionArgument); - try v.elem(f, w); + if (f.typeOf(bin_op.rhs).isVector(zcu)) try v.elem(f, w); try f.object.dg.renderBuiltinInfo(w, 
scalar_ty, info); try w.writeAll(");\n"); try v.end(f, inst, w); @@ -6536,7 +6540,7 @@ fn airBinBuiltinCall( try v.elem(f, writer); try writer.writeAll(", "); try f.writeCValue(writer, rhs, .FunctionArgument); - try v.elem(f, writer); + if (f.typeOf(bin_op.rhs).isVector(zcu)) try v.elem(f, writer); try f.object.dg.renderBuiltinInfo(writer, scalar_ty, info); try writer.writeAll(");\n"); try v.end(f, inst, writer); diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 1820faf90c..6f411e7490 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -36,6 +36,10 @@ const compilerRtIntAbbrev = target_util.compilerRtIntAbbrev; const Error = error{ OutOfMemory, CodegenFail }; +pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { + return comptime &.initEmpty(); +} + fn subArchName(features: std.Target.Cpu.Feature.Set, arch: anytype, mappings: anytype) ?[]const u8 { inline for (mappings) |mapping| { if (arch.featureSetHas(features, mapping[0])) return mapping[1]; @@ -8923,6 +8927,8 @@ pub const FuncGen = struct { const rhs = try self.resolveInst(extra.rhs); const lhs_ty = self.typeOf(extra.lhs); + if (lhs_ty.isVector(zcu) and !self.typeOf(extra.rhs).isVector(zcu)) + return self.ng.todo("implement vector shifts with scalar rhs", .{}); const lhs_scalar_ty = lhs_ty.scalarType(zcu); const dest_ty = self.typeOfIndex(inst); @@ -8992,6 +8998,8 @@ pub const FuncGen = struct { const rhs = try self.resolveInst(bin_op.rhs); const lhs_ty = self.typeOf(bin_op.lhs); + if (lhs_ty.isVector(zcu) and !self.typeOf(bin_op.rhs).isVector(zcu)) + return self.ng.todo("implement vector shifts with scalar rhs", .{}); const lhs_scalar_ty = lhs_ty.scalarType(zcu); const casted_rhs = try self.wip.conv(.unsigned, rhs, try o.lowerType(lhs_ty), ""); @@ -9003,14 +9011,17 @@ pub const FuncGen = struct { fn airShl(self: *FuncGen, inst: Air.Inst.Index) !Builder.Value { const o = self.ng.object; + const zcu = o.pt.zcu; const bin_op = 
self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; const lhs = try self.resolveInst(bin_op.lhs); const rhs = try self.resolveInst(bin_op.rhs); - const lhs_type = self.typeOf(bin_op.lhs); + const lhs_ty = self.typeOf(bin_op.lhs); + if (lhs_ty.isVector(zcu) and !self.typeOf(bin_op.rhs).isVector(zcu)) + return self.ng.todo("implement vector shifts with scalar rhs", .{}); - const casted_rhs = try self.wip.conv(.unsigned, rhs, try o.lowerType(lhs_type), ""); + const casted_rhs = try self.wip.conv(.unsigned, rhs, try o.lowerType(lhs_ty), ""); return self.wip.bin(.shl, lhs, casted_rhs, ""); } @@ -9029,6 +9040,8 @@ pub const FuncGen = struct { const llvm_lhs_scalar_ty = llvm_lhs_ty.scalarType(&o.builder); const rhs_ty = self.typeOf(bin_op.rhs); + if (lhs_ty.isVector(zcu) and !rhs_ty.isVector(zcu)) + return self.ng.todo("implement vector shifts with scalar rhs", .{}); const rhs_info = rhs_ty.intInfo(zcu); assert(rhs_info.signedness == .unsigned); const llvm_rhs_ty = try o.lowerType(rhs_ty); @@ -9101,6 +9114,8 @@ pub const FuncGen = struct { const rhs = try self.resolveInst(bin_op.rhs); const lhs_ty = self.typeOf(bin_op.lhs); + if (lhs_ty.isVector(zcu) and !self.typeOf(bin_op.rhs).isVector(zcu)) + return self.ng.todo("implement vector shifts with scalar rhs", .{}); const lhs_scalar_ty = lhs_ty.scalarType(zcu); const casted_rhs = try self.wip.conv(.unsigned, rhs, try o.lowerType(lhs_ty), ""); @@ -9255,8 +9270,6 @@ pub const FuncGen = struct { const operand_ty = self.typeOf(ty_op.operand); const dest_ty = self.typeOfIndex(inst); const target = zcu.getTarget(); - const dest_bits = dest_ty.floatBits(target); - const src_bits = operand_ty.floatBits(target); if (intrinsicsAllowed(dest_ty, target) and intrinsicsAllowed(operand_ty, target)) { return self.wip.cast(.fptrunc, operand, try o.lowerType(dest_ty), ""); @@ -9264,6 +9277,8 @@ pub const FuncGen = struct { const operand_llvm_ty = try o.lowerType(operand_ty); const dest_llvm_ty = try o.lowerType(dest_ty); + const 
dest_bits = dest_ty.floatBits(target); + const src_bits = operand_ty.floatBits(target); const fn_name = try o.builder.strtabStringFmt("__trunc{s}f{s}f2", .{ compilerRtFloatAbbrev(src_bits), compilerRtFloatAbbrev(dest_bits), }); @@ -9348,11 +9363,12 @@ pub const FuncGen = struct { return self.wip.conv(.unsigned, operand, llvm_dest_ty, ""); } - if (operand_ty.zigTypeTag(zcu) == .int and inst_ty.isPtrAtRuntime(zcu)) { + const operand_scalar_ty = operand_ty.scalarType(zcu); + const inst_scalar_ty = inst_ty.scalarType(zcu); + if (operand_scalar_ty.zigTypeTag(zcu) == .int and inst_scalar_ty.isPtrAtRuntime(zcu)) { return self.wip.cast(.inttoptr, operand, llvm_dest_ty, ""); } - - if (operand_ty.isPtrAtRuntime(zcu) and inst_ty.zigTypeTag(zcu) == .int) { + if (operand_scalar_ty.isPtrAtRuntime(zcu) and inst_scalar_ty.zigTypeTag(zcu) == .int) { return self.wip.cast(.ptrtoint, operand, llvm_dest_ty, ""); } diff --git a/src/codegen/spirv.zig b/src/codegen/spirv.zig index 5041634a75..c51e38ac7e 100644 --- a/src/codegen/spirv.zig +++ b/src/codegen/spirv.zig @@ -28,6 +28,10 @@ const SpvAssembler = @import("spirv/Assembler.zig"); const InstMap = std.AutoHashMapUnmanaged(Air.Inst.Index, IdRef); +pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { + return comptime &.initEmpty(); +} + pub const zig_call_abi_ver = 3; pub const big_int_bits = 32; @@ -3380,6 +3384,10 @@ const NavGen = struct { const zcu = self.pt.zcu; const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; + if (self.typeOf(bin_op.lhs).isVector(zcu) and !self.typeOf(bin_op.rhs).isVector(zcu)) { + return self.fail("vector shift with scalar rhs", .{}); + } + const base = try self.temporary(bin_op.lhs); const shift = try self.temporary(bin_op.rhs); @@ -3866,6 +3874,10 @@ const NavGen = struct { const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; const extra = self.air.extraData(Air.Bin, ty_pl.payload).data; + if 
(self.typeOf(extra.lhs).isVector(zcu) and !self.typeOf(extra.rhs).isVector(zcu)) { + return self.fail("vector shift with scalar rhs", .{}); + } + const base = try self.temporary(extra.lhs); const shift = try self.temporary(extra.rhs); diff --git a/src/target.zig b/src/target.zig index 6119b002a4..c33588b1b5 100644 --- a/src/target.zig +++ b/src/target.zig @@ -850,9 +850,5 @@ pub inline fn backendSupportsFeature(backend: std.builtin.CompilerBackend, compt .stage2_llvm => false, else => true, }, - .all_vector_instructions => switch (backend) { - .stage2_x86_64 => true, - else => false, - }, }; } -- cgit v1.2.3 From 77e6513030a6258a893c2f11ad9708c9612b7715 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Fri, 30 May 2025 12:56:52 -0400 Subject: cbe: implement `stdbool.h` reserved identifiers Also remove the legalize pass from zig1. --- src/Air/Legalize.zig | 4 +++- src/Zcu/PerThread.zig | 4 +++- src/arch/aarch64/CodeGen.zig | 4 ++-- src/arch/arm/CodeGen.zig | 4 ++-- src/arch/powerpc/CodeGen.zig | 4 ++-- src/arch/riscv64/CodeGen.zig | 4 ++-- src/arch/sparc64/CodeGen.zig | 4 ++-- src/arch/wasm/CodeGen.zig | 4 ++-- src/codegen.zig | 30 +++++++++++++++++++++++------- src/codegen/c.zig | 10 +++++++--- src/codegen/llvm.zig | 4 ++-- src/codegen/spirv.zig | 4 ++-- src/dev.zig | 3 +++ 13 files changed, 55 insertions(+), 28 deletions(-) (limited to 'src/codegen') diff --git a/src/Air/Legalize.zig b/src/Air/Legalize.zig index 870270f089..221e98aebb 100644 --- a/src/Air/Legalize.zig +++ b/src/Air/Legalize.zig @@ -164,13 +164,14 @@ pub const Features = std.enums.EnumSet(Feature); pub const Error = std.mem.Allocator.Error; pub fn legalize(air: *Air, pt: Zcu.PerThread, features: *const Features) Error!void { + dev.check(.legalize); + assert(!features.bits.eql(.initEmpty())); // backend asked to run legalize, but no features were enabled var l: Legalize = .{ .pt = pt, .air_instructions = air.instructions.toMultiArrayList(), .air_extra = air.extra, .features = features, }; - if 
(l.features.bits.eql(.initEmpty())) return; defer air.* = l.getTmpAir(); const main_extra = l.extraData(Air.Block, l.air_extra.items[@intFromEnum(Air.ExtraIndex.main_block)]); try l.legalizeBody(main_extra.end, main_extra.data.body_len); @@ -845,6 +846,7 @@ inline fn replaceInst(l: *Legalize, inst: Air.Inst.Index, tag: Air.Inst.Tag, dat const Air = @import("../Air.zig"); const assert = std.debug.assert; +const dev = @import("../dev.zig"); const Legalize = @This(); const std = @import("std"); const Type = @import("../Type.zig"); diff --git a/src/Zcu/PerThread.zig b/src/Zcu/PerThread.zig index 93f87b506b..a67d4dcc05 100644 --- a/src/Zcu/PerThread.zig +++ b/src/Zcu/PerThread.zig @@ -1741,7 +1741,9 @@ pub fn linkerUpdateFunc(pt: Zcu.PerThread, func_index: InternPool.Index, air: *A return; } - try air.legalize(pt, @import("../codegen.zig").legalizeFeatures(pt, nav_index)); + legalize: { + try air.legalize(pt, @import("../codegen.zig").legalizeFeatures(pt, nav_index) orelse break :legalize); + } var liveness = try Air.Liveness.analyze(gpa, air.*, ip); defer liveness.deinit(gpa); diff --git a/src/arch/aarch64/CodeGen.zig b/src/arch/aarch64/CodeGen.zig index 88fdeb9831..e9e7159938 100644 --- a/src/arch/aarch64/CodeGen.zig +++ b/src/arch/aarch64/CodeGen.zig @@ -40,8 +40,8 @@ const gp = abi.RegisterClass.gp; const InnerError = CodeGenError || error{OutOfRegisters}; -pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { - return comptime &.initEmpty(); +pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { + return null; } gpa: Allocator, diff --git a/src/arch/arm/CodeGen.zig b/src/arch/arm/CodeGen.zig index a018302566..8cc1d0a607 100644 --- a/src/arch/arm/CodeGen.zig +++ b/src/arch/arm/CodeGen.zig @@ -41,8 +41,8 @@ const gp = abi.RegisterClass.gp; const InnerError = CodeGenError || error{OutOfRegisters}; -pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { - return comptime &.initEmpty(); 
+pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { + return null; } gpa: Allocator, diff --git a/src/arch/powerpc/CodeGen.zig b/src/arch/powerpc/CodeGen.zig index a3a4615b4a..0cfee67ebd 100644 --- a/src/arch/powerpc/CodeGen.zig +++ b/src/arch/powerpc/CodeGen.zig @@ -10,8 +10,8 @@ const Zcu = @import("../../Zcu.zig"); const assert = std.debug.assert; const log = std.log.scoped(.codegen); -pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { - return comptime &.initEmpty(); +pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { + return null; } pub fn generate( diff --git a/src/arch/riscv64/CodeGen.zig b/src/arch/riscv64/CodeGen.zig index 5732f4cd41..981b5a800f 100644 --- a/src/arch/riscv64/CodeGen.zig +++ b/src/arch/riscv64/CodeGen.zig @@ -51,8 +51,8 @@ const Instruction = encoding.Instruction; const InnerError = CodeGenError || error{OutOfRegisters}; -pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { - return comptime &.initEmpty(); +pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { + return null; } pt: Zcu.PerThread, diff --git a/src/arch/sparc64/CodeGen.zig b/src/arch/sparc64/CodeGen.zig index ead7ef3e99..d473222288 100644 --- a/src/arch/sparc64/CodeGen.zig +++ b/src/arch/sparc64/CodeGen.zig @@ -41,8 +41,8 @@ const Self = @This(); const InnerError = CodeGenError || error{OutOfRegisters}; -pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { - return comptime &.initEmpty(); +pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { + return null; } const RegisterView = enum(u1) { diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig index a130f1508d..ec448ca29b 100644 --- a/src/arch/wasm/CodeGen.zig +++ b/src/arch/wasm/CodeGen.zig @@ -31,8 +31,8 @@ const libcFloatSuffix = target_util.libcFloatSuffix; const compilerRtFloatAbbrev = 
target_util.compilerRtFloatAbbrev; const compilerRtIntAbbrev = target_util.compilerRtIntAbbrev; -pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { - return comptime &.initEmpty(); +pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { + return null; } /// Reference to the function declaration the code diff --git a/src/codegen.zig b/src/codegen.zig index 8d609682f1..1f794bbeea 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -27,9 +27,22 @@ pub const CodeGenError = GenerateSymbolError || error{ CodegenFail, }; -fn devFeatureForBackend(comptime backend: std.builtin.CompilerBackend) dev.Feature { - comptime assert(mem.startsWith(u8, @tagName(backend), "stage2_")); - return @field(dev.Feature, @tagName(backend)["stage2_".len..] ++ "_backend"); +fn devFeatureForBackend(backend: std.builtin.CompilerBackend) dev.Feature { + return switch (backend) { + .other, .stage1 => unreachable, + .stage2_aarch64 => .aarch64_backend, + .stage2_arm => .arm_backend, + .stage2_c => .c_backend, + .stage2_llvm => .llvm_backend, + .stage2_powerpc => .powerpc_backend, + .stage2_riscv64 => .riscv64_backend, + .stage2_sparc64 => .sparc64_backend, + .stage2_spirv64 => .spirv64_backend, + .stage2_wasm => .wasm_backend, + .stage2_x86 => .x86_backend, + .stage2_x86_64 => .x86_64_backend, + _ => unreachable, + }; } fn importBackend(comptime backend: std.builtin.CompilerBackend) type { @@ -49,10 +62,10 @@ fn importBackend(comptime backend: std.builtin.CompilerBackend) type { }; } -pub fn legalizeFeatures(pt: Zcu.PerThread, nav_index: InternPool.Nav.Index) *const Air.Legalize.Features { +pub fn legalizeFeatures(pt: Zcu.PerThread, nav_index: InternPool.Nav.Index) ?*const Air.Legalize.Features { const zcu = pt.zcu; const target = &zcu.navFileScope(nav_index).mod.?.resolved_target.result; - return switch (target_util.zigBackend(target.*, zcu.comp.config.use_llvm)) { + switch (target_util.zigBackend(target.*, zcu.comp.config.use_llvm)) { else 
=> unreachable, inline .stage2_llvm, .stage2_c, @@ -65,8 +78,11 @@ pub fn legalizeFeatures(pt: Zcu.PerThread, nav_index: InternPool.Nav.Index) *con .stage2_sparc64, .stage2_spirv64, .stage2_powerpc, - => |backend| importBackend(backend).legalizeFeatures(target), - }; + => |backend| { + dev.check(devFeatureForBackend(backend)); + return importBackend(backend).legalizeFeatures(target); + }, + } } pub fn generateFunction( diff --git a/src/codegen/c.zig b/src/codegen/c.zig index 2d3b236a86..e76c8e069d 100644 --- a/src/codegen/c.zig +++ b/src/codegen/c.zig @@ -20,8 +20,8 @@ const Alignment = InternPool.Alignment; const BigIntLimb = std.math.big.Limb; const BigInt = std.math.big.int; -pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { - return comptime &.initEmpty(); +pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { + return null; } pub const CType = @import("c/Type.zig"); @@ -210,7 +210,6 @@ const reserved_idents = std.StaticStringMap(void).initComptime(.{ .{ "atomic_ushort", {} }, .{ "atomic_wchar_t", {} }, .{ "auto", {} }, - .{ "bool", {} }, .{ "break", {} }, .{ "case", {} }, .{ "char", {} }, @@ -270,6 +269,11 @@ const reserved_idents = std.StaticStringMap(void).initComptime(.{ .{ "va_end", {} }, .{ "va_copy", {} }, + // stdbool.h + .{ "bool", {} }, + .{ "false", {} }, + .{ "true", {} }, + // stddef.h .{ "offsetof", {} }, diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 6f411e7490..77d8f3ff47 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -36,8 +36,8 @@ const compilerRtIntAbbrev = target_util.compilerRtIntAbbrev; const Error = error{ OutOfMemory, CodegenFail }; -pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { - return comptime &.initEmpty(); +pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { + return null; } fn subArchName(features: std.Target.Cpu.Feature.Set, arch: anytype, mappings: anytype) ?[]const u8 { diff 
--git a/src/codegen/spirv.zig b/src/codegen/spirv.zig index c51e38ac7e..7c96909751 100644 --- a/src/codegen/spirv.zig +++ b/src/codegen/spirv.zig @@ -28,8 +28,8 @@ const SpvAssembler = @import("spirv/Assembler.zig"); const InstMap = std.AutoHashMapUnmanaged(Air.Inst.Index, IdRef); -pub inline fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { - return comptime &.initEmpty(); +pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { + return null; } pub const zig_call_abi_ver = 3; diff --git a/src/dev.zig b/src/dev.zig index 473cb4a8d0..019f18daeb 100644 --- a/src/dev.zig +++ b/src/dev.zig @@ -67,6 +67,7 @@ pub const Env = enum { .incremental, .ast_gen, .sema, + .legalize, .llvm_backend, .c_backend, .wasm_backend, @@ -144,6 +145,7 @@ pub const Env = enum { .build_command, .stdio_listen, .incremental, + .legalize, .x86_64_backend, .elf_linker, => true, @@ -222,6 +224,7 @@ pub const Feature = enum { incremental, ast_gen, sema, + legalize, llvm_backend, c_backend, -- cgit v1.2.3 From 4c4dacf81a5da85a1f7d1550ed45f5cb20fd1524 Mon Sep 17 00:00:00 2001 From: mlugg Date: Fri, 30 May 2025 17:12:11 +0100 Subject: Legalize: replace `safety_checked_instructions` This adds 4 `Legalize.Feature`s: * `expand_intcast_safe` * `expand_add_safe` * `expand_sub_safe` * `expand_mul_safe` These do pretty much what they say on the tin. This logic was previously in Sema, used when `Zcu.Feature.safety_checked_instructions` was not supported by the backend. That `Zcu.Feature` has been removed in favour of this legalization. 
--- src/Air.zig | 6 - src/Air/Legalize.zig | 586 +++++++++++++++++++++++++++++++++++++------ src/Sema.zig | 157 +----------- src/Zcu.zig | 9 - src/Zcu/PerThread.zig | 15 ++ src/arch/riscv64/CodeGen.zig | 7 +- src/arch/wasm/CodeGen.zig | 7 +- src/arch/x86_64/CodeGen.zig | 4 + src/codegen/spirv.zig | 7 +- src/target.zig | 4 - 10 files changed, 558 insertions(+), 244 deletions(-) (limited to 'src/codegen') diff --git a/src/Air.zig b/src/Air.zig index 6fce4d19f0..dec266db9c 100644 --- a/src/Air.zig +++ b/src/Air.zig @@ -50,8 +50,6 @@ pub const Inst = struct { /// is the same as both operands. /// The panic handler function must be populated before lowering AIR /// that contains this instruction. - /// This instruction will only be emitted if the backend has the - /// feature `safety_checked_instructions`. /// Uses the `bin_op` field. add_safe, /// Float addition. The instruction is allowed to have equal or more @@ -79,8 +77,6 @@ pub const Inst = struct { /// is the same as both operands. /// The panic handler function must be populated before lowering AIR /// that contains this instruction. - /// This instruction will only be emitted if the backend has the - /// feature `safety_checked_instructions`. /// Uses the `bin_op` field. sub_safe, /// Float subtraction. The instruction is allowed to have equal or more @@ -108,8 +104,6 @@ pub const Inst = struct { /// is the same as both operands. /// The panic handler function must be populated before lowering AIR /// that contains this instruction. - /// This instruction will only be emitted if the backend has the - /// feature `safety_checked_instructions`. /// Uses the `bin_op` field. mul_safe, /// Float multiplication. 
The instruction is allowed to have equal or more diff --git a/src/Air/Legalize.zig b/src/Air/Legalize.zig index 221e98aebb..20d3fa457e 100644 --- a/src/Air/Legalize.zig +++ b/src/Air/Legalize.zig @@ -81,6 +81,19 @@ pub const Feature = enum { /// Legalize reduce of a one element vector to a bitcast reduce_one_elem_to_bitcast, + /// Replace `intcast_safe` with an explicit safety check which `call`s the panic function on failure. + /// Not compatible with `scalarize_intcast_safe`. + expand_intcast_safe, + /// Replace `add_safe` with an explicit safety check which `call`s the panic function on failure. + /// Not compatible with `scalarize_add_safe`. + expand_add_safe, + /// Replace `sub_safe` with an explicit safety check which `call`s the panic function on failure. + /// Not compatible with `scalarize_sub_safe`. + expand_sub_safe, + /// Replace `mul_safe` with an explicit safety check which `call`s the panic function on failure. + /// Not compatible with `scalarize_mul_safe`. + expand_mul_safe, + fn scalarize(tag: Air.Inst.Tag) Feature { return switch (tag) { else => unreachable, @@ -205,17 +218,14 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { .arg, => {}, inline .add, - .add_safe, .add_optimized, .add_wrap, .add_sat, .sub, - .sub_safe, .sub_optimized, .sub_wrap, .sub_sat, .mul, - .mul_safe, .mul_optimized, .mul_wrap, .mul_sat, @@ -240,6 +250,27 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op; if (l.typeOf(bin_op.lhs).isVector(zcu)) continue :inst try l.scalarize(inst, .bin_op); }, + .add_safe => if (l.features.contains(.expand_add_safe)) { + assert(!l.features.contains(.scalarize_add_safe)); // it doesn't make sense to do both + continue :inst l.replaceInst(inst, .block, try l.safeArithmeticBlockPayload(inst, .add_with_overflow)); + } else if (l.features.contains(.scalarize_add_safe)) { + const bin_op = 
l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op; + if (l.typeOf(bin_op.lhs).isVector(zcu)) continue :inst try l.scalarize(inst, .bin_op); + }, + .sub_safe => if (l.features.contains(.expand_sub_safe)) { + assert(!l.features.contains(.scalarize_sub_safe)); // it doesn't make sense to do both + continue :inst l.replaceInst(inst, .block, try l.safeArithmeticBlockPayload(inst, .sub_with_overflow)); + } else if (l.features.contains(.scalarize_sub_safe)) { + const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op; + if (l.typeOf(bin_op.lhs).isVector(zcu)) continue :inst try l.scalarize(inst, .bin_op); + }, + .mul_safe => if (l.features.contains(.expand_mul_safe)) { + assert(!l.features.contains(.scalarize_mul_safe)); // it doesn't make sense to do both + continue :inst l.replaceInst(inst, .block, try l.safeArithmeticBlockPayload(inst, .mul_with_overflow)); + } else if (l.features.contains(.scalarize_mul_safe)) { + const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op; + if (l.typeOf(bin_op.lhs).isVector(zcu)) continue :inst try l.scalarize(inst, .bin_op); + }, .ptr_add, .ptr_sub, .add_with_overflow, @@ -295,7 +326,6 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { .fptrunc, .fpext, .intcast, - .intcast_safe, .trunc, .int_from_float, .int_from_float_optimized, @@ -312,6 +342,13 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { if (to_ty.isVector(zcu) and from_ty.isVector(zcu) and to_ty.vectorLen(zcu) == from_ty.vectorLen(zcu)) continue :inst try l.scalarize(inst, .ty_op); }, + .intcast_safe => if (l.features.contains(.expand_intcast_safe)) { + assert(!l.features.contains(.scalarize_intcast_safe)); // it doesn't make sense to do both + continue :inst l.replaceInst(inst, .block, try l.safeIntcastBlockPayload(inst)); + } else if (l.features.contains(.scalarize_intcast_safe)) { + const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op; + if 
(ty_op.ty.toType().isVector(zcu)) continue :inst try l.scalarize(inst, .ty_op); + }, .block, .loop, => { @@ -550,81 +587,83 @@ fn scalarizeBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, comptime data_ const expected_instructions_len = l.air_instructions.len + (6 + arity + 8); try l.air_instructions.ensureTotalCapacity(gpa, expected_instructions_len); - var res_block: Block(4) = .empty; + var res_block_buf: [4]Air.Inst.Index = undefined; + var res_block: Block = .init(&res_block_buf); { - const res_alloc_inst = res_block.add(l.addInstAssumeCapacity(.{ + const res_alloc_inst = res_block.add(l, .{ .tag = .alloc, .data = .{ .ty = try pt.singleMutPtrType(res_ty) }, - })); - const index_alloc_inst = res_block.add(l.addInstAssumeCapacity(.{ + }); + const index_alloc_inst = res_block.add(l, .{ .tag = .alloc, .data = .{ .ty = .ptr_usize }, - })); - _ = res_block.add(l.addInstAssumeCapacity(.{ + }); + _ = res_block.add(l, .{ .tag = .store, .data = .{ .bin_op = .{ .lhs = index_alloc_inst.toRef(), .rhs = .zero_usize, } }, - })); + }); const loop_inst: Air.Inst.Index = @enumFromInt(l.air_instructions.len + (3 + arity + 7)); - var loop_block: Block(3 + arity + 2) = .empty; + var loop_block_buf: [3 + arity + 2]Air.Inst.Index = undefined; + var loop_block: Block = .init(&loop_block_buf); { - const cur_index_inst = loop_block.add(l.addInstAssumeCapacity(.{ + const cur_index_inst = loop_block.add(l, .{ .tag = .load, .data = .{ .ty_op = .{ .ty = .usize_type, .operand = index_alloc_inst.toRef(), } }, - })); - _ = loop_block.add(l.addInstAssumeCapacity(.{ + }); + _ = loop_block.add(l, .{ .tag = .vector_store_elem, .data = .{ .vector_store_elem = .{ .vector_ptr = res_alloc_inst.toRef(), .payload = try l.addExtra(Air.Bin, .{ .lhs = cur_index_inst.toRef(), - .rhs = loop_block.add(l.addInstAssumeCapacity(res_elem: switch (data_tag) { + .rhs = loop_block.add(l, res_elem: switch (data_tag) { .un_op => .{ .tag = orig.tag, - .data = .{ .un_op = loop_block.add(l.addInstAssumeCapacity(.{ 
+ .data = .{ .un_op = loop_block.add(l, .{ .tag = .array_elem_val, .data = .{ .bin_op = .{ .lhs = orig.data.un_op, .rhs = cur_index_inst.toRef(), } }, - })).toRef() }, + }).toRef() }, }, .ty_op => .{ .tag = orig.tag, .data = .{ .ty_op = .{ .ty = Air.internedToRef(orig.data.ty_op.ty.toType().scalarType(zcu).toIntern()), - .operand = loop_block.add(l.addInstAssumeCapacity(.{ + .operand = loop_block.add(l, .{ .tag = .array_elem_val, .data = .{ .bin_op = .{ .lhs = orig.data.ty_op.operand, .rhs = cur_index_inst.toRef(), } }, - })).toRef(), + }).toRef(), } }, }, .bin_op => .{ .tag = orig.tag, .data = .{ .bin_op = .{ - .lhs = loop_block.add(l.addInstAssumeCapacity(.{ + .lhs = loop_block.add(l, .{ .tag = .array_elem_val, .data = .{ .bin_op = .{ .lhs = orig.data.bin_op.lhs, .rhs = cur_index_inst.toRef(), } }, - })).toRef(), - .rhs = loop_block.add(l.addInstAssumeCapacity(.{ + }).toRef(), + .rhs = loop_block.add(l, .{ .tag = .array_elem_val, .data = .{ .bin_op = .{ .lhs = orig.data.bin_op.rhs, .rhs = cur_index_inst.toRef(), } }, - })).toRef(), + }).toRef(), } }, }, .ty_pl_vector_cmp => { @@ -650,20 +689,20 @@ fn scalarizeBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, comptime data_ }, }, .data = .{ .bin_op = .{ - .lhs = loop_block.add(l.addInstAssumeCapacity(.{ + .lhs = loop_block.add(l, .{ .tag = .array_elem_val, .data = .{ .bin_op = .{ .lhs = extra.lhs, .rhs = cur_index_inst.toRef(), } }, - })).toRef(), - .rhs = loop_block.add(l.addInstAssumeCapacity(.{ + }).toRef(), + .rhs = loop_block.add(l, .{ .tag = .array_elem_val, .data = .{ .bin_op = .{ .lhs = extra.rhs, .rhs = cur_index_inst.toRef(), } }, - })).toRef(), + }).toRef(), } }, }; }, @@ -673,94 +712,96 @@ fn scalarizeBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, comptime data_ .tag = orig.tag, .data = .{ .pl_op = .{ .payload = try l.addExtra(Air.Bin, .{ - .lhs = loop_block.add(l.addInstAssumeCapacity(.{ + .lhs = loop_block.add(l, .{ .tag = .array_elem_val, .data = .{ .bin_op = .{ .lhs = extra.lhs, .rhs = 
cur_index_inst.toRef(), } }, - })).toRef(), - .rhs = loop_block.add(l.addInstAssumeCapacity(.{ + }).toRef(), + .rhs = loop_block.add(l, .{ .tag = .array_elem_val, .data = .{ .bin_op = .{ .lhs = extra.rhs, .rhs = cur_index_inst.toRef(), } }, - })).toRef(), + }).toRef(), }), - .operand = loop_block.add(l.addInstAssumeCapacity(.{ + .operand = loop_block.add(l, .{ .tag = .array_elem_val, .data = .{ .bin_op = .{ .lhs = orig.data.pl_op.operand, .rhs = cur_index_inst.toRef(), } }, - })).toRef(), + }).toRef(), } }, }; }, - })).toRef(), + }).toRef(), }), } }, - })); - const not_done_inst = loop_block.add(l.addInstAssumeCapacity(.{ + }); + const not_done_inst = loop_block.add(l, .{ .tag = .cmp_lt, .data = .{ .bin_op = .{ .lhs = cur_index_inst.toRef(), .rhs = try pt.intRef(.usize, res_ty.vectorLen(zcu) - 1), } }, - })); + }); - var not_done_block: Block(3) = .empty; + var not_done_block_buf: [3]Air.Inst.Index = undefined; + var not_done_block: Block = .init(¬_done_block_buf); { - _ = not_done_block.add(l.addInstAssumeCapacity(.{ + _ = not_done_block.add(l, .{ .tag = .store, .data = .{ .bin_op = .{ .lhs = index_alloc_inst.toRef(), - .rhs = not_done_block.add(l.addInstAssumeCapacity(.{ + .rhs = not_done_block.add(l, .{ .tag = .add, .data = .{ .bin_op = .{ .lhs = cur_index_inst.toRef(), .rhs = .one_usize, } }, - })).toRef(), + }).toRef(), } }, - })); - _ = not_done_block.add(l.addInstAssumeCapacity(.{ + }); + _ = not_done_block.add(l, .{ .tag = .repeat, .data = .{ .repeat = .{ .loop_inst = loop_inst } }, - })); + }); } - var done_block: Block(2) = .empty; + var done_block_buf: [2]Air.Inst.Index = undefined; + var done_block: Block = .init(&done_block_buf); { - _ = done_block.add(l.addInstAssumeCapacity(.{ + _ = done_block.add(l, .{ .tag = .br, .data = .{ .br = .{ .block_inst = orig_inst, - .operand = done_block.add(l.addInstAssumeCapacity(.{ + .operand = done_block.add(l, .{ .tag = .load, .data = .{ .ty_op = .{ .ty = Air.internedToRef(res_ty.toIntern()), .operand = 
res_alloc_inst.toRef(), } }, - })).toRef(), + }).toRef(), } }, - })); + }); } - _ = loop_block.add(l.addInstAssumeCapacity(.{ + _ = loop_block.add(l, .{ .tag = .cond_br, .data = .{ .pl_op = .{ .operand = not_done_inst.toRef(), .payload = try l.addCondBrBodies(not_done_block.body(), done_block.body()), } }, - })); + }); } - assert(loop_inst == res_block.add(l.addInstAssumeCapacity(.{ + assert(loop_inst == res_block.add(l, .{ .tag = .loop, .data = .{ .ty_pl = .{ .ty = .noreturn_type, .payload = try l.addBlockBody(loop_block.body()), } }, - }))); + })); } assert(l.air_instructions.len == expected_instructions_len); return .{ .ty_pl = .{ @@ -768,29 +809,423 @@ fn scalarizeBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, comptime data_ .payload = try l.addBlockBody(res_block.body()), } }; } +fn safeIntcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.Inst.Data { + const pt = l.pt; + const zcu = pt.zcu; + const gpa = zcu.gpa; + const ty_op = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].ty_op; + + const operand_ref = ty_op.operand; + const operand_ty = l.typeOf(operand_ref); + const dest_ty = ty_op.ty.toType(); + + const is_vector = operand_ty.zigTypeTag(zcu) == .vector; + const operand_scalar_ty = operand_ty.scalarType(zcu); + const dest_scalar_ty = dest_ty.scalarType(zcu); + + assert(operand_scalar_ty.zigTypeTag(zcu) == .int); + const dest_is_enum = switch (dest_scalar_ty.zigTypeTag(zcu)) { + .int => false, + .@"enum" => true, + else => unreachable, + }; + + const operand_info = operand_scalar_ty.intInfo(zcu); + const dest_info = dest_scalar_ty.intInfo(zcu); + + const have_min_check, const have_max_check = c: { + const dest_pos_bits = dest_info.bits - @intFromBool(dest_info.signedness == .signed); + const operand_pos_bits = operand_info.bits - @intFromBool(operand_info.signedness == .signed); + const dest_allows_neg = dest_info.signedness == .signed and dest_info.bits > 0; + const operand_allows_neg = operand_info.signedness == .signed 
and operand_info.bits > 0; + break :c .{ + operand_allows_neg and (!dest_allows_neg or dest_info.bits < operand_info.bits), + dest_pos_bits < operand_pos_bits, + }; + }; + + // The worst-case scenario in terms of total instructions and total condbrs is the case where + // the result type is an exhaustive enum whose tag type is smaller than the operand type: + // + // %x = block({ + // %1 = cmp_lt(%y, @min_allowed_int) + // %2 = cmp_gt(%y, @max_allowed_int) + // %3 = bool_or(%1, %2) + // %4 = cond_br(%3, { + // %5 = call(@panic.invalidEnumValue, []) + // %6 = unreach() + // }, { + // %7 = intcast(@res_ty, %y) + // %8 = is_named_enum_value(%7) + // %9 = cond_br(%8, { + // %10 = br(%x, %7) + // }, { + // %11 = call(@panic.invalidEnumValue, []) + // %12 = unreach() + // }) + // }) + // }) + // + // Note that vectors of enums don't exist -- the worst case for vectors is this: + // + // %x = block({ + // %1 = cmp_lt(%y, @min_allowed_int) + // %2 = cmp_gt(%y, @max_allowed_int) + // %3 = bool_or(%1, %2) + // %4 = reduce(%3, .@"or") + // %5 = cond_br(%4, { + // %6 = call(@panic.invalidEnumValue, []) + // %7 = unreach() + // }, { + // %8 = intcast(@res_ty, %y) + // %9 = br(%x, %8) + // }) + // }) + + try l.air_instructions.ensureUnusedCapacity(gpa, 12); + var body_inst_buf: [12]Air.Inst.Index = undefined; + var condbr_buf: [2]CondBr = undefined; + var condbr_idx: usize = 0; + + var main_block: Block = .init(&body_inst_buf); + var cur_block: *Block = &main_block; + + const panic_id: Zcu.SimplePanicId = if (dest_is_enum) .invalid_enum_value else .cast_truncated_data; + + if (have_min_check or have_max_check) { + const dest_int_ty = if (dest_is_enum) dest_ty.intTagType(zcu) else dest_ty; + const condbr = &condbr_buf[condbr_idx]; + condbr_idx += 1; + const below_min_inst: Air.Inst.Index = if (have_min_check) inst: { + const min_val_ref = Air.internedToRef((try dest_int_ty.minInt(pt, operand_ty)).toIntern()); + break :inst try cur_block.addCmp(l, is_vector, .lt, operand_ref, 
min_val_ref); + } else undefined; + const above_max_inst: Air.Inst.Index = if (have_max_check) inst: { + const max_val_ref = Air.internedToRef((try dest_int_ty.maxInt(pt, operand_ty)).toIntern()); + break :inst try cur_block.addCmp(l, is_vector, .gt, operand_ref, max_val_ref); + } else undefined; + const out_of_range_inst: Air.Inst.Index = inst: { + if (have_min_check and have_max_check) break :inst cur_block.add(l, .{ + .tag = .bool_or, + .data = .{ .bin_op = .{ + .lhs = below_min_inst.toRef(), + .rhs = above_max_inst.toRef(), + } }, + }); + if (have_min_check) break :inst below_min_inst; + if (have_max_check) break :inst above_max_inst; + unreachable; + }; + const scalar_out_of_range_inst: Air.Inst.Index = if (is_vector) cur_block.add(l, .{ + .tag = .reduce, + .data = .{ .reduce = .{ + .operand = out_of_range_inst.toRef(), + .operation = .Or, + } }, + }) else out_of_range_inst; + condbr.* = .init(l, scalar_out_of_range_inst.toRef(), cur_block, .{ + .true = .cold, + .false = .none, + }); + condbr.then_block = .init(cur_block.stealRemainingCapacity()); + try condbr.then_block.addPanic(l, panic_id); + condbr.else_block = .init(condbr.then_block.stealRemainingCapacity()); + cur_block = &condbr.else_block; + } + + // Now we know we're in-range, we can intcast: + const cast_inst = cur_block.add(l, .{ + .tag = .intcast, + .data = .{ .ty_op = .{ + .ty = Air.internedToRef(dest_ty.toIntern()), + .operand = operand_ref, + } }, + }); + // For ints we're already done, but for exhaustive enums we must check this is a valid tag. 
+ if (dest_is_enum and !dest_ty.isNonexhaustiveEnum(zcu) and zcu.backendSupportsFeature(.is_named_enum_value)) { + assert(!is_vector); // vectors of enums don't exist + // We are building this: + // %1 = is_named_enum_value(%cast_inst) + // %2 = cond_br(%1, { + // + // }, { + // + // }) + const is_named_inst = cur_block.add(l, .{ + .tag = .is_named_enum_value, + .data = .{ .un_op = cast_inst.toRef() }, + }); + const condbr = &condbr_buf[condbr_idx]; + condbr_idx += 1; + condbr.* = .init(l, is_named_inst.toRef(), cur_block, .{ + .true = .none, + .false = .cold, + }); + condbr.else_block = .init(cur_block.stealRemainingCapacity()); + try condbr.else_block.addPanic(l, panic_id); + condbr.then_block = .init(condbr.else_block.stealRemainingCapacity()); + cur_block = &condbr.then_block; + } + // Finally, just `br` to our outer `block`. + _ = cur_block.add(l, .{ + .tag = .br, + .data = .{ .br = .{ + .block_inst = orig_inst, + .operand = cast_inst.toRef(), + } }, + }); + // We might not have used all of the instructions; that's intentional. 
+ _ = cur_block.stealRemainingCapacity(); + + for (condbr_buf[0..condbr_idx]) |*condbr| try condbr.finish(l); + return .{ .ty_pl = .{ + .ty = Air.internedToRef(dest_ty.toIntern()), + .payload = try l.addBlockBody(main_block.body()), + } }; +} +fn safeArithmeticBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, overflow_op_tag: Air.Inst.Tag) Error!Air.Inst.Data { + const pt = l.pt; + const zcu = pt.zcu; + const gpa = zcu.gpa; + const bin_op = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].bin_op; + + const operand_ty = l.typeOf(bin_op.lhs); + assert(l.typeOf(bin_op.rhs).toIntern() == operand_ty.toIntern()); + const is_vector = operand_ty.zigTypeTag(zcu) == .vector; + + const overflow_tuple_ty = try pt.overflowArithmeticTupleType(operand_ty); + const overflow_bits_ty = overflow_tuple_ty.fieldType(1, zcu); + + // The worst-case scenario is a vector operand: + // + // %1 = add_with_overflow(%x, %y) + // %2 = struct_field_val(%1, .@"1") + // %3 = reduce(%2, .@"or") + // %4 = bitcast(%3, @bool_type) + // %5 = cond_br(%4, { + // %6 = call(@panic.integerOverflow, []) + // %7 = unreach() + // }, { + // %8 = struct_field_val(%1, .@"0") + // %9 = br(%z, %8) + // }) + try l.air_instructions.ensureUnusedCapacity(gpa, 9); + var body_inst_buf: [9]Air.Inst.Index = undefined; + + var main_block: Block = .init(&body_inst_buf); + + const overflow_op_inst = main_block.add(l, .{ + .tag = overflow_op_tag, + .data = .{ .ty_pl = .{ + .ty = Air.internedToRef(overflow_tuple_ty.toIntern()), + .payload = try l.addExtra(Air.Bin, .{ + .lhs = bin_op.lhs, + .rhs = bin_op.rhs, + }), + } }, + }); + const overflow_bits_inst = main_block.add(l, .{ + .tag = .struct_field_val, + .data = .{ .ty_pl = .{ + .ty = Air.internedToRef(overflow_bits_ty.toIntern()), + .payload = try l.addExtra(Air.StructField, .{ + .struct_operand = overflow_op_inst.toRef(), + .field_index = 1, + }), + } }, + }); + const any_overflow_bit_inst = if (is_vector) main_block.add(l, .{ + .tag = .reduce, + .data = .{ 
.reduce = .{ + .operand = overflow_bits_inst.toRef(), + .operation = .Or, + } }, + }) else overflow_bits_inst; + const any_overflow_inst = try main_block.addCmp(l, false, .eq, any_overflow_bit_inst.toRef(), .one_u1); + + var condbr: CondBr = .init(l, any_overflow_inst.toRef(), &main_block, .{ + .true = .cold, + .false = .none, + }); + condbr.then_block = .init(main_block.stealRemainingCapacity()); + try condbr.then_block.addPanic(l, .integer_overflow); + condbr.else_block = .init(condbr.then_block.stealRemainingCapacity()); + + const result_inst = condbr.else_block.add(l, .{ + .tag = .struct_field_val, + .data = .{ .ty_pl = .{ + .ty = Air.internedToRef(operand_ty.toIntern()), + .payload = try l.addExtra(Air.StructField, .{ + .struct_operand = overflow_op_inst.toRef(), + .field_index = 0, + }), + } }, + }); + _ = condbr.else_block.add(l, .{ + .tag = .br, + .data = .{ .br = .{ + .block_inst = orig_inst, + .operand = result_inst.toRef(), + } }, + }); + // We might not have used all of the instructions; that's intentional. 
+ _ = condbr.else_block.stealRemainingCapacity(); + + try condbr.finish(l); + return .{ .ty_pl = .{ + .ty = Air.internedToRef(operand_ty.toIntern()), + .payload = try l.addBlockBody(main_block.body()), + } }; +} -fn Block(comptime capacity: usize) type { - return struct { - instructions: [capacity]Air.Inst.Index, - len: usize, +const Block = struct { + instructions: []Air.Inst.Index, + len: usize, - const empty: @This() = .{ - .instructions = undefined, + /// There are two common usages of the API: + /// * `buf.len` is exactly the number of instructions which will be in this block + /// * `buf.len` is no smaller than necessary, and `b.stealRemainingCapacity` will be used + fn init(buf: []Air.Inst.Index) Block { + return .{ + .instructions = buf, .len = 0, }; + } - fn add(b: *@This(), inst: Air.Inst.Index) Air.Inst.Index { - b.instructions[b.len] = inst; - b.len += 1; - return inst; + /// Like `Legalize.addInstAssumeCapacity`, but also appends the instruction to `b`. + fn add(b: *Block, l: *Legalize, inst_data: Air.Inst) Air.Inst.Index { + const inst = l.addInstAssumeCapacity(inst_data); + b.instructions[b.len] = inst; + b.len += 1; + return inst; + } + + /// Adds the code to call the panic handler `panic_id`. This is usually `.call` then `.unreach`, + /// but if `Zcu.Feature.panic_fn` is unsupported, we lower to `.trap` instead. 
+ fn addPanic(b: *Block, l: *Legalize, panic_id: Zcu.SimplePanicId) Error!void { + const zcu = l.pt.zcu; + if (!zcu.backendSupportsFeature(.panic_fn)) { + _ = b.add(l, .{ + .tag = .trap, + .data = .{ .no_op = {} }, + }); + return; } + const panic_fn_val = zcu.builtin_decl_values.get(panic_id.toBuiltin()); + _ = b.add(l, .{ + .tag = .call, + .data = .{ .pl_op = .{ + .operand = Air.internedToRef(panic_fn_val), + .payload = try l.addExtra(Air.Call, .{ .args_len = 0 }), + } }, + }); + _ = b.add(l, .{ + .tag = .unreach, + .data = .{ .no_op = {} }, + }); + } - fn body(b: *const @This()) []const Air.Inst.Index { - assert(b.len == b.instructions.len); - return &b.instructions; + /// Adds a `cmp_*` instruction (including maybe `cmp_vector`) to `b`. This is a fairly thin wrapper + /// around `add`, although it does compute the result type if `is_vector` (`@Vector(n, bool)`). + fn addCmp( + b: *Block, + l: *Legalize, + is_vector: bool, + op: std.math.CompareOperator, + lhs: Air.Inst.Ref, + rhs: Air.Inst.Ref, + ) Error!Air.Inst.Index { + const pt = l.pt; + if (is_vector) { + const bool_vec_ty = try pt.vectorType(.{ + .child = .bool_type, + .len = l.typeOf(lhs).vectorLen(pt.zcu), + }); + return b.add(l, .{ + .tag = .cmp_vector, + .data = .{ .ty_pl = .{ + .ty = Air.internedToRef(bool_vec_ty.toIntern()), + .payload = try l.addExtra(Air.VectorCmp, .{ + .lhs = lhs, + .rhs = rhs, + .op = Air.VectorCmp.encodeOp(op), + }), + } }, + }); } + return b.add(l, .{ + .tag = switch (op) { + .lt => .cmp_lt, + .lte => .cmp_lte, + .eq => .cmp_eq, + .gte => .cmp_gte, + .gt => .cmp_gt, + .neq => .cmp_neq, + }, + .data = .{ .bin_op = .{ + .lhs = lhs, + .rhs = rhs, + } }, + }); + } + + /// Returns the unused capacity of `b.instructions`, and shrinks `b.instructions` down to `b.len`. + /// This is useful when you've provided a buffer big enough for all your instructions, but you are + /// now starting a new block and some of them need to live there instead. 
+ fn stealRemainingCapacity(b: *Block) []Air.Inst.Index { + const remaining = b.instructions[b.len..]; + b.instructions = b.instructions[0..b.len]; + return remaining; + } + + fn body(b: *const Block) []const Air.Inst.Index { + assert(b.len == b.instructions.len); + return b.instructions; + } +}; + +const CondBr = struct { + inst: Air.Inst.Index, + hints: BranchHints, + then_block: Block, + else_block: Block, + + const BranchHints = struct { + true: std.builtin.BranchHint, + false: std.builtin.BranchHint, }; -} + + /// The return value has `then_block` and `else_block` initialized to `undefined`; it is the + /// caller's reponsibility to initialize them. + fn init(l: *Legalize, operand: Air.Inst.Ref, parent_block: *Block, hints: BranchHints) CondBr { + return .{ + .inst = parent_block.add(l, .{ + .tag = .cond_br, + .data = .{ .pl_op = .{ + .operand = operand, + .payload = undefined, + } }, + }), + .hints = hints, + .then_block = undefined, + .else_block = undefined, + }; + } + + fn finish(cond_br: CondBr, l: *Legalize) Error!void { + const data = &l.air_instructions.items(.data)[@intFromEnum(cond_br.inst)]; + data.pl_op.payload = try l.addCondBrBodiesHints( + cond_br.then_block.body(), + cond_br.else_block.body(), + .{ + .true = cond_br.hints.true, + .false = cond_br.hints.false, + .then_cov = .none, + .else_cov = .none, + }, + ); + } +}; fn addInstAssumeCapacity(l: *Legalize, inst: Air.Inst) Air.Inst.Index { defer l.air_instructions.appendAssumeCapacity(inst); @@ -818,17 +1253,20 @@ fn addBlockBody(l: *Legalize, body: []const Air.Inst.Index) Error!u32 { } fn addCondBrBodies(l: *Legalize, then_body: []const Air.Inst.Index, else_body: []const Air.Inst.Index) Error!u32 { + return l.addCondBrBodiesHints(then_body, else_body, .{ + .true = .none, + .false = .none, + .then_cov = .none, + .else_cov = .none, + }); +} +fn addCondBrBodiesHints(l: *Legalize, then_body: []const Air.Inst.Index, else_body: []const Air.Inst.Index, hints: Air.CondBr.BranchHints) Error!u32 { try 
l.air_extra.ensureUnusedCapacity(l.pt.zcu.gpa, 3 + then_body.len + else_body.len); defer { l.air_extra.appendSliceAssumeCapacity(&.{ @intCast(then_body.len), @intCast(else_body.len), - @bitCast(Air.CondBr.BranchHints{ - .true = .none, - .false = .none, - .then_cov = .none, - .else_cov = .none, - }), + @bitCast(hints), }); l.air_extra.appendSliceAssumeCapacity(@ptrCast(then_body)); l.air_extra.appendSliceAssumeCapacity(@ptrCast(else_body)); diff --git a/src/Sema.zig b/src/Sema.zig index c18f71d1fa..34fb468716 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -8912,21 +8912,10 @@ fn zirEnumFromInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError try sema.requireRuntimeBlock(block, src, operand_src); if (block.wantSafety()) { - if (zcu.backendSupportsFeature(.safety_checked_instructions)) { + if (zcu.backendSupportsFeature(.panic_fn)) { _ = try sema.preparePanicId(src, .invalid_enum_value); - return block.addTyOp(.intcast_safe, dest_ty, operand); - } else { - // Slightly silly fallback case... - const int_tag_ty = dest_ty.intTagType(zcu); - // Use `intCast`, since it'll set up the Sema-emitted safety checks for us! 
- const int_val = try sema.intCast(block, src, int_tag_ty, src, operand, src, true, true); - const result = try block.addBitCast(dest_ty, int_val); - if (!dest_ty.isNonexhaustiveEnum(zcu) and zcu.backendSupportsFeature(.is_named_enum_value)) { - const ok = try block.addUnOp(.is_named_enum_value, result); - try sema.addSafetyCheck(block, src, ok, .invalid_enum_value); - } - return result; } + return block.addTyOp(.intcast_safe, dest_ty, operand); } return block.addTyOp(.intcast, dest_ty, operand); } @@ -10331,90 +10320,11 @@ fn intCast( try sema.requireRuntimeBlock(block, src, operand_src); if (runtime_safety and block.wantSafety()) { - if (zcu.backendSupportsFeature(.safety_checked_instructions)) { + if (zcu.backendSupportsFeature(.panic_fn)) { _ = try sema.preparePanicId(src, .negative_to_unsigned); _ = try sema.preparePanicId(src, .cast_truncated_data); - return block.addTyOp(.intcast_safe, dest_ty, operand); - } - const actual_info = operand_scalar_ty.intInfo(zcu); - const wanted_info = dest_scalar_ty.intInfo(zcu); - const actual_bits = actual_info.bits; - const wanted_bits = wanted_info.bits; - const actual_value_bits = actual_bits - @intFromBool(actual_info.signedness == .signed); - const wanted_value_bits = wanted_bits - @intFromBool(wanted_info.signedness == .signed); - - // range shrinkage - // requirement: int value fits into target type - if (wanted_value_bits < actual_value_bits) { - const dest_max_val_scalar = try dest_scalar_ty.maxIntScalar(pt, operand_scalar_ty); - const dest_max_val = try sema.splat(operand_ty, dest_max_val_scalar); - const dest_max = Air.internedToRef(dest_max_val.toIntern()); - - if (actual_info.signedness == .signed) { - const diff = try block.addBinOp(.sub_wrap, dest_max, operand); - - // Reinterpret the sign-bit as part of the value. This will make - // negative differences (`operand` > `dest_max`) appear too big. 
- const unsigned_scalar_operand_ty = try pt.intType(.unsigned, actual_bits); - const unsigned_operand_ty = if (is_vector) try pt.vectorType(.{ - .len = dest_ty.vectorLen(zcu), - .child = unsigned_scalar_operand_ty.toIntern(), - }) else unsigned_scalar_operand_ty; - const diff_unsigned = try block.addBitCast(unsigned_operand_ty, diff); - - // If the destination type is signed, then we need to double its - // range to account for negative values. - const dest_range_val = if (wanted_info.signedness == .signed) range_val: { - const one_scalar = try pt.intValue(unsigned_scalar_operand_ty, 1); - const one = if (is_vector) Value.fromInterned(try pt.intern(.{ .aggregate = .{ - .ty = unsigned_operand_ty.toIntern(), - .storage = .{ .repeated_elem = one_scalar.toIntern() }, - } })) else one_scalar; - const range_minus_one = try dest_max_val.shl(one, unsigned_operand_ty, sema.arena, pt); - const result = try arith.addWithOverflow(sema, unsigned_operand_ty, range_minus_one, one); - assert(result.overflow_bit.compareAllWithZero(.eq, zcu)); - break :range_val result.wrapped_result; - } else try pt.getCoerced(dest_max_val, unsigned_operand_ty); - const dest_range = Air.internedToRef(dest_range_val.toIntern()); - - const ok = if (is_vector) ok: { - const is_in_range = try block.addCmpVector(diff_unsigned, dest_range, .lte); - const all_in_range = try block.addReduce(is_in_range, .And); - break :ok all_in_range; - } else ok: { - const is_in_range = try block.addBinOp(.cmp_lte, diff_unsigned, dest_range); - break :ok is_in_range; - }; - // TODO negative_to_unsigned? 
- try sema.addSafetyCheck(block, src, ok, if (safety_panics_are_enum) .invalid_enum_value else .cast_truncated_data); - } else { - const ok = if (is_vector) ok: { - const is_in_range = try block.addCmpVector(operand, dest_max, .lte); - const all_in_range = try block.addReduce(is_in_range, .And); - break :ok all_in_range; - } else ok: { - const is_in_range = try block.addBinOp(.cmp_lte, operand, dest_max); - break :ok is_in_range; - }; - try sema.addSafetyCheck(block, src, ok, if (safety_panics_are_enum) .invalid_enum_value else .cast_truncated_data); - } - } else if (actual_info.signedness == .signed and wanted_info.signedness == .unsigned) { - // no shrinkage, yes sign loss - // requirement: signed to unsigned >= 0 - const ok = if (is_vector) ok: { - const scalar_zero = try pt.intValue(operand_scalar_ty, 0); - const zero_val = try sema.splat(operand_ty, scalar_zero); - const zero_inst = Air.internedToRef(zero_val.toIntern()); - const is_in_range = try block.addCmpVector(operand, zero_inst, .gte); - const all_in_range = try block.addReduce(is_in_range, .And); - break :ok all_in_range; - } else ok: { - const zero_inst = Air.internedToRef((try pt.intValue(operand_ty, 0)).toIntern()); - const is_in_range = try block.addBinOp(.cmp_gte, operand, zero_inst); - break :ok is_in_range; - }; - try sema.addSafetyCheck(block, src, ok, if (safety_panics_are_enum) .invalid_enum_value else .negative_to_unsigned); } + return block.addTyOp(.intcast_safe, dest_ty, operand); } return block.addTyOp(.intcast, dest_ty, operand); } @@ -14316,7 +14226,7 @@ fn zirShl( } if (air_tag == .shl_exact) { - const op_ov_tuple_ty = try sema.overflowArithmeticTupleType(lhs_ty); + const op_ov_tuple_ty = try pt.overflowArithmeticTupleType(lhs_ty); const op_ov = try block.addInst(.{ .tag = .shl_with_overflow, .data = .{ .ty_pl = .{ @@ -16111,7 +16021,7 @@ fn zirOverflowArithmetic( const maybe_lhs_val = try sema.resolveValue(lhs); const maybe_rhs_val = try sema.resolveValue(rhs); - const tuple_ty = try 
sema.overflowArithmeticTupleType(dest_ty); + const tuple_ty = try pt.overflowArithmeticTupleType(dest_ty); const overflow_ty: Type = .fromInterned(ip.indexToKey(tuple_ty.toIntern()).tuple_type.types.get(ip)[1]); var result: struct { @@ -16284,24 +16194,6 @@ fn splat(sema: *Sema, ty: Type, val: Value) !Value { return Value.fromInterned(repeated); } -fn overflowArithmeticTupleType(sema: *Sema, ty: Type) !Type { - const pt = sema.pt; - const zcu = pt.zcu; - const ip = &zcu.intern_pool; - const ov_ty: Type = if (ty.zigTypeTag(zcu) == .vector) try pt.vectorType(.{ - .len = ty.vectorLen(zcu), - .child = .u1_type, - }) else .u1; - - const types = [2]InternPool.Index{ ty.toIntern(), ov_ty.toIntern() }; - const values = [2]InternPool.Index{ .none, .none }; - const tuple_ty = try ip.getTupleType(zcu.gpa, pt.tid, .{ - .types = &types, - .values = &values, - }); - return .fromInterned(tuple_ty); -} - fn analyzeArithmetic( sema: *Sema, block: *Block, @@ -16477,41 +16369,10 @@ fn analyzeArithmetic( } if (block.wantSafety() and want_safety and scalar_tag == .int) { - if (zcu.backendSupportsFeature(.safety_checked_instructions)) { - if (air_tag != air_tag_safe) { - _ = try sema.preparePanicId(src, .integer_overflow); - } - return block.addBinOp(air_tag_safe, casted_lhs, casted_rhs); - } else { - const maybe_op_ov: ?Air.Inst.Tag = switch (air_tag) { - .add => .add_with_overflow, - .sub => .sub_with_overflow, - .mul => .mul_with_overflow, - else => null, - }; - if (maybe_op_ov) |op_ov_tag| { - const op_ov_tuple_ty = try sema.overflowArithmeticTupleType(resolved_type); - const op_ov = try block.addInst(.{ - .tag = op_ov_tag, - .data = .{ .ty_pl = .{ - .ty = Air.internedToRef(op_ov_tuple_ty.toIntern()), - .payload = try sema.addExtra(Air.Bin{ - .lhs = casted_lhs, - .rhs = casted_rhs, - }), - } }, - }); - const ov_bit = try sema.tupleFieldValByIndex(block, op_ov, 1, op_ov_tuple_ty); - const any_ov_bit = if (resolved_type.zigTypeTag(zcu) == .vector) - try block.addReduce(ov_bit, .Or) - 
else - ov_bit; - const no_ov = try block.addBinOp(.cmp_eq, any_ov_bit, .zero_u1); - - try sema.addSafetyCheck(block, src, no_ov, .integer_overflow); - return sema.tupleFieldValByIndex(block, op_ov, 0, op_ov_tuple_ty); - } + if (air_tag != air_tag_safe and zcu.backendSupportsFeature(.panic_fn)) { + _ = try sema.preparePanicId(src, .integer_overflow); } + return block.addBinOp(air_tag_safe, casted_lhs, casted_rhs); } return block.addBinOp(air_tag, casted_lhs, casted_rhs); } diff --git a/src/Zcu.zig b/src/Zcu.zig index 38e926298e..15e53aa202 100644 --- a/src/Zcu.zig +++ b/src/Zcu.zig @@ -3829,15 +3829,6 @@ pub const Feature = enum { is_named_enum_value, error_set_has_value, field_reordering, - /// When this feature is supported, the backend supports the following AIR instructions: - /// * `Air.Inst.Tag.add_safe` - /// * `Air.Inst.Tag.sub_safe` - /// * `Air.Inst.Tag.mul_safe` - /// * `Air.Inst.Tag.intcast_safe` - /// The motivation for this feature is that it makes AIR smaller, and makes it easier - /// to generate better machine code in the backends. All backends should migrate to - /// enabling this feature. - safety_checked_instructions, /// If the backend supports running from another thread. separate_thread, }; diff --git a/src/Zcu/PerThread.zig b/src/Zcu/PerThread.zig index a67d4dcc05..f8ae173b75 100644 --- a/src/Zcu/PerThread.zig +++ b/src/Zcu/PerThread.zig @@ -3844,6 +3844,21 @@ pub fn nullValue(pt: Zcu.PerThread, opt_ty: Type) Allocator.Error!Value { } })); } +/// `ty` is an integer or a vector of integers. 
+pub fn overflowArithmeticTupleType(pt: Zcu.PerThread, ty: Type) !Type { + const zcu = pt.zcu; + const ip = &zcu.intern_pool; + const ov_ty: Type = if (ty.zigTypeTag(zcu) == .vector) try pt.vectorType(.{ + .len = ty.vectorLen(zcu), + .child = .u1_type, + }) else .u1; + const tuple_ty = try ip.getTupleType(zcu.gpa, pt.tid, .{ + .types = &.{ ty.toIntern(), ov_ty.toIntern() }, + .values = &.{ .none, .none }, + }); + return .fromInterned(tuple_ty); +} + pub fn smallestUnsignedInt(pt: Zcu.PerThread, max: u64) Allocator.Error!Type { return pt.intType(.unsigned, Type.smallestUnsignedBits(max)); } diff --git a/src/arch/riscv64/CodeGen.zig b/src/arch/riscv64/CodeGen.zig index 981b5a800f..316b29734a 100644 --- a/src/arch/riscv64/CodeGen.zig +++ b/src/arch/riscv64/CodeGen.zig @@ -52,7 +52,12 @@ const Instruction = encoding.Instruction; const InnerError = CodeGenError || error{OutOfRegisters}; pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { - return null; + return comptime &.initMany(&.{ + .expand_intcast_safe, + .expand_add_safe, + .expand_sub_safe, + .expand_mul_safe, + }); } pt: Zcu.PerThread, diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig index ec448ca29b..0704557cbb 100644 --- a/src/arch/wasm/CodeGen.zig +++ b/src/arch/wasm/CodeGen.zig @@ -32,7 +32,12 @@ const compilerRtFloatAbbrev = target_util.compilerRtFloatAbbrev; const compilerRtIntAbbrev = target_util.compilerRtIntAbbrev; pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { - return null; + return comptime &.initMany(&.{ + .expand_intcast_safe, + .expand_add_safe, + .expand_sub_safe, + .expand_mul_safe, + }); } /// Reference to the function declaration the code diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 49a32fc66c..2acee5b0e2 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -88,6 +88,10 @@ pub fn legalizeFeatures(target: *const std.Target) *const Air.Legalize.Features 
.unsplat_shift_rhs = false, .reduce_one_elem_to_bitcast = true, + .expand_intcast_safe = true, + .expand_add_safe = true, + .expand_sub_safe = true, + .expand_mul_safe = true, }), }; } diff --git a/src/codegen/spirv.zig b/src/codegen/spirv.zig index 7c96909751..e23ffbf66a 100644 --- a/src/codegen/spirv.zig +++ b/src/codegen/spirv.zig @@ -29,7 +29,12 @@ const SpvAssembler = @import("spirv/Assembler.zig"); const InstMap = std.AutoHashMapUnmanaged(Air.Inst.Index, IdRef); pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { - return null; + return comptime &.initMany(&.{ + .expand_intcast_safe, + .expand_add_safe, + .expand_sub_safe, + .expand_mul_safe, + }); } pub const zig_call_abi_ver = 3; diff --git a/src/target.zig b/src/target.zig index c33588b1b5..4b0cc20bda 100644 --- a/src/target.zig +++ b/src/target.zig @@ -842,10 +842,6 @@ pub inline fn backendSupportsFeature(backend: std.builtin.CompilerBackend, compt .stage2_c, .stage2_llvm, .stage2_x86_64 => true, else => false, }, - .safety_checked_instructions => switch (backend) { - .stage2_llvm => true, - else => false, - }, .separate_thread => switch (backend) { .stage2_llvm => false, else => true, -- cgit v1.2.3 From d9b6d1ed33d18eb13fa2cb39da3e7a381742975b Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Fri, 30 May 2025 14:38:46 -0400 Subject: cbe: legalize safety instructions in non-zig1 builds This is valid if the bootstrap dev env doesn't need to support runtime safety. Another solution can always be implemented if needs change. 
--- src/arch/riscv64/CodeGen.zig | 2 +- src/arch/wasm/CodeGen.zig | 2 +- src/codegen/c.zig | 8 +++++++- src/codegen/spirv.zig | 2 +- src/dev.zig | 3 +++ 5 files changed, 13 insertions(+), 4 deletions(-) (limited to 'src/codegen') diff --git a/src/arch/riscv64/CodeGen.zig b/src/arch/riscv64/CodeGen.zig index 316b29734a..8a40c61cdc 100644 --- a/src/arch/riscv64/CodeGen.zig +++ b/src/arch/riscv64/CodeGen.zig @@ -51,7 +51,7 @@ const Instruction = encoding.Instruction; const InnerError = CodeGenError || error{OutOfRegisters}; -pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { +pub fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { return comptime &.initMany(&.{ .expand_intcast_safe, .expand_add_safe, diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig index 0704557cbb..36908eb236 100644 --- a/src/arch/wasm/CodeGen.zig +++ b/src/arch/wasm/CodeGen.zig @@ -31,7 +31,7 @@ const libcFloatSuffix = target_util.libcFloatSuffix; const compilerRtFloatAbbrev = target_util.compilerRtFloatAbbrev; const compilerRtIntAbbrev = target_util.compilerRtIntAbbrev; -pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { +pub fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { return comptime &.initMany(&.{ .expand_intcast_safe, .expand_add_safe, diff --git a/src/codegen/c.zig b/src/codegen/c.zig index e76c8e069d..8d947ce56a 100644 --- a/src/codegen/c.zig +++ b/src/codegen/c.zig @@ -4,6 +4,7 @@ const assert = std.debug.assert; const mem = std.mem; const log = std.log.scoped(.c); +const dev = @import("../dev.zig"); const link = @import("../link.zig"); const Zcu = @import("../Zcu.zig"); const Module = @import("../Package/Module.zig"); @@ -21,7 +22,12 @@ const BigIntLimb = std.math.big.Limb; const BigInt = std.math.big.int; pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { - return null; + return if (dev.env.supports(.legalize)) comptime &.initMany(&.{ + 
.expand_intcast_safe, + .expand_add_safe, + .expand_sub_safe, + .expand_mul_safe, + }) else null; // we don't currently ask zig1 to use safe optimization modes } pub const CType = @import("c/Type.zig"); diff --git a/src/codegen/spirv.zig b/src/codegen/spirv.zig index e23ffbf66a..1381a79075 100644 --- a/src/codegen/spirv.zig +++ b/src/codegen/spirv.zig @@ -28,7 +28,7 @@ const SpvAssembler = @import("spirv/Assembler.zig"); const InstMap = std.AutoHashMapUnmanaged(Air.Inst.Index, IdRef); -pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features { +pub fn legalizeFeatures(_: *const std.Target) *const Air.Legalize.Features { return comptime &.initMany(&.{ .expand_intcast_safe, .expand_add_safe, diff --git a/src/dev.zig b/src/dev.zig index 019f18daeb..1dc8264ebc 100644 --- a/src/dev.zig +++ b/src/dev.zig @@ -1,5 +1,8 @@ pub const Env = enum { /// zig1 features + /// - `-ofmt=c` only + /// - `-OReleaseFast` or `-OReleaseSmall` only + /// - no `@setRuntimeSafety(true)` bootstrap, /// zig2 features -- cgit v1.2.3 From add2976a9ba76ec661ae5668eb2a8dca2ccfad42 Mon Sep 17 00:00:00 2001 From: mlugg Date: Mon, 26 May 2025 05:07:13 +0100 Subject: compiler: implement better shuffle AIR Runtime `@shuffle` has two cases which backends generally want to handle differently for efficiency: * One runtime vector operand; some result elements may be comptime-known * Two runtime vector operands; some result elements may be undefined The latter case happens if both vectors given to `@shuffle` are runtime-known and they are both used (i.e. the mask refers to them). Otherwise, if the result is not entirely comptime-known, we are in the former case. `Sema` now diffentiates these two cases in the AIR so that backends can easily handle them however they want to. Note that this *doesn't* really involve Sema doing any more work than it would otherwise need to, so there's not really a negative here! 
Most existing backends have their lowerings for `@shuffle` migrated in this commit. The LLVM backend uses new lowerings suggested by Jacob as ones which it will handle effectively. The x86_64 backend has not yet been migrated; for now there's a panic in there. Jacob will implement that before this is merged anywhere. --- src/Air.zig | 131 +++++++++- src/Air/Legalize.zig | 3 +- src/Air/Liveness.zig | 33 ++- src/Air/Liveness/Verify.zig | 13 +- src/Air/types_resolved.zig | 22 +- src/Sema.zig | 268 +++++++++++---------- src/Zcu/PerThread.zig | 3 +- src/arch/aarch64/CodeGen.zig | 16 +- src/arch/arm/CodeGen.zig | 15 +- src/arch/riscv64/CodeGen.zig | 15 +- src/arch/sparc64/CodeGen.zig | 3 +- src/arch/wasm/CodeGen.zig | 123 ++++++---- src/arch/x86_64/CodeGen.zig | 2 +- src/codegen/c.zig | 74 ++++-- src/codegen/llvm.zig | 215 ++++++++++++++--- src/codegen/spirv.zig | 74 +++--- src/print_air.zig | 40 ++- ...ith_selected_index_past_first_vector_length.zig | 26 +- 18 files changed, 755 insertions(+), 321 deletions(-) (limited to 'src/codegen') diff --git a/src/Air.zig b/src/Air.zig index 94e3550e79..ccfe9e9694 100644 --- a/src/Air.zig +++ b/src/Air.zig @@ -699,9 +699,21 @@ pub const Inst = struct { /// equal to the scalar value. /// Uses the `ty_op` field. splat, - /// Constructs a vector by selecting elements from `a` and `b` based on `mask`. - /// Uses the `ty_pl` field with payload `Shuffle`. - shuffle, + /// Constructs a vector by selecting elements from a single vector based on a mask. Each + /// mask element is either an index into the vector, or a comptime-known value, or "undef". + /// Uses the `ty_pl` field, where the payload index points to: + /// 1. mask_elem: ShuffleOneMask // for each `mask_len`, which comes from `ty_pl.ty` + /// 2. operand: Ref // guaranteed not to be an interned value + /// See `unwrapShuffleOne`. + shuffle_one, + /// Constructs a vector by selecting elements from two vectors based on a mask. 
Each mask + /// element is either an index into one of the vectors, or "undef". + /// Uses the `ty_pl` field, where the payload index points to: + /// 1. mask_elem: ShuffleTwoMask // for each `mask_len`, which comes from `ty_pl.ty` + /// 2. operand_a: Ref // guaranteed not to be an interned value + /// 3. operand_b: Ref // guaranteed not to be an interned value + /// See `unwrapShuffleTwo`. + shuffle_two, /// Constructs a vector element-wise from `a` or `b` based on `pred`. /// Uses the `pl_op` field with `pred` as operand, and payload `Bin`. select, @@ -1299,13 +1311,6 @@ pub const FieldParentPtr = struct { field_index: u32, }; -pub const Shuffle = struct { - a: Inst.Ref, - b: Inst.Ref, - mask: InternPool.Index, - mask_len: u32, -}; - pub const VectorCmp = struct { lhs: Inst.Ref, rhs: Inst.Ref, @@ -1320,6 +1325,64 @@ pub const VectorCmp = struct { } }; +/// Used by `Inst.Tag.shuffle_one`. Represents a mask element which either indexes into a +/// runtime-known vector, or is a comptime-known value. +pub const ShuffleOneMask = packed struct(u32) { + index: u31, + kind: enum(u1) { elem, value }, + pub fn elem(idx: u32) ShuffleOneMask { + return .{ .index = @intCast(idx), .kind = .elem }; + } + pub fn value(val: Value) ShuffleOneMask { + return .{ .index = @intCast(@intFromEnum(val.toIntern())), .kind = .value }; + } + pub const Unwrapped = union(enum) { + /// The resulting element is this index into the runtime vector. + elem: u32, + /// The resulting element is this comptime-known value. + /// It is correctly typed. It might be `undefined`. + value: InternPool.Index, + }; + pub fn unwrap(raw: ShuffleOneMask) Unwrapped { + return switch (raw.kind) { + .elem => .{ .elem = raw.index }, + .value => .{ .value = @enumFromInt(raw.index) }, + }; + } +}; + +/// Used by `Inst.Tag.shuffle_two`. Represents a mask element which either indexes into one +/// of two runtime-known vectors, or is undefined. 
+pub const ShuffleTwoMask = enum(u32) { + undef = std.math.maxInt(u32), + _, + pub fn aElem(idx: u32) ShuffleTwoMask { + return @enumFromInt(idx << 1); + } + pub fn bElem(idx: u32) ShuffleTwoMask { + return @enumFromInt(idx << 1 | 1); + } + pub const Unwrapped = union(enum) { + /// The resulting element is this index into the first runtime vector. + a_elem: u32, + /// The resulting element is this index into the second runtime vector. + b_elem: u32, + /// The resulting element is `undefined`. + undef, + }; + pub fn unwrap(raw: ShuffleTwoMask) Unwrapped { + switch (raw) { + .undef => return .undef, + _ => {}, + } + const x = @intFromEnum(raw); + return switch (@as(u1, @truncate(x))) { + 0 => .{ .a_elem = x >> 1 }, + 1 => .{ .b_elem = x >> 1 }, + }; + } +}; + /// Trailing: /// 0. `Inst.Ref` for every outputs_len /// 1. `Inst.Ref` for every inputs_len @@ -1503,7 +1566,6 @@ pub fn typeOfIndex(air: *const Air, inst: Air.Inst.Index, ip: *const InternPool) .cmpxchg_weak, .cmpxchg_strong, .slice, - .shuffle, .aggregate_init, .union_init, .field_parent_ptr, @@ -1517,6 +1579,8 @@ pub fn typeOfIndex(air: *const Air, inst: Air.Inst.Index, ip: *const InternPool) .ptr_sub, .try_ptr, .try_ptr_cold, + .shuffle_one, + .shuffle_two, => return datas[@intFromEnum(inst)].ty_pl.ty.toType(), .not, @@ -1903,7 +1967,8 @@ pub fn mustLower(air: Air, inst: Air.Inst.Index, ip: *const InternPool) bool { .reduce, .reduce_optimized, .splat, - .shuffle, + .shuffle_one, + .shuffle_two, .select, .is_named_enum_value, .tag_name, @@ -2030,6 +2095,48 @@ pub fn unwrapSwitch(air: *const Air, switch_inst: Inst.Index) UnwrappedSwitch { }; } +pub fn unwrapShuffleOne(air: *const Air, zcu: *const Zcu, inst_index: Inst.Index) struct { + result_ty: Type, + operand: Inst.Ref, + mask: []const ShuffleOneMask, +} { + const inst = air.instructions.get(@intFromEnum(inst_index)); + switch (inst.tag) { + .shuffle_one => {}, + else => unreachable, // assertion failure + } + const result_ty: Type = 
.fromInterned(inst.data.ty_pl.ty.toInterned().?); + const mask_len: u32 = result_ty.vectorLen(zcu); + const extra_idx = inst.data.ty_pl.payload; + return .{ + .result_ty = result_ty, + .operand = @enumFromInt(air.extra.items[extra_idx + mask_len]), + .mask = @ptrCast(air.extra.items[extra_idx..][0..mask_len]), + }; +} + +pub fn unwrapShuffleTwo(air: *const Air, zcu: *const Zcu, inst_index: Inst.Index) struct { + result_ty: Type, + operand_a: Inst.Ref, + operand_b: Inst.Ref, + mask: []const ShuffleTwoMask, +} { + const inst = air.instructions.get(@intFromEnum(inst_index)); + switch (inst.tag) { + .shuffle_two => {}, + else => unreachable, // assertion failure + } + const result_ty: Type = .fromInterned(inst.data.ty_pl.ty.toInterned().?); + const mask_len: u32 = result_ty.vectorLen(zcu); + const extra_idx = inst.data.ty_pl.payload; + return .{ + .result_ty = result_ty, + .operand_a = @enumFromInt(air.extra.items[extra_idx + mask_len + 0]), + .operand_b = @enumFromInt(air.extra.items[extra_idx + mask_len + 1]), + .mask = @ptrCast(air.extra.items[extra_idx..][0..mask_len]), + }; +} + pub const typesFullyResolved = types_resolved.typesFullyResolved; pub const typeFullyResolved = types_resolved.checkType; pub const valFullyResolved = types_resolved.checkVal; diff --git a/src/Air/Legalize.zig b/src/Air/Legalize.zig index 85db181bd1..b71725995a 100644 --- a/src/Air/Legalize.zig +++ b/src/Air/Legalize.zig @@ -521,7 +521,8 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { } }, .splat, - .shuffle, + .shuffle_one, + .shuffle_two, => {}, .select, => if (l.features.contains(.scalarize_select)) continue :inst try l.scalarize(inst, .select_pl_op_bin), diff --git a/src/Air/Liveness.zig b/src/Air/Liveness.zig index 34ecde26e2..7acba48ed0 100644 --- a/src/Air/Liveness.zig +++ b/src/Air/Liveness.zig @@ -15,6 +15,7 @@ const Liveness = @This(); const trace = @import("../tracy.zig").trace; const Air = @import("../Air.zig"); const InternPool = 
@import("../InternPool.zig"); +const Zcu = @import("../Zcu.zig"); pub const Verify = @import("Liveness/Verify.zig"); @@ -136,12 +137,15 @@ fn LivenessPassData(comptime pass: LivenessPass) type { }; } -pub fn analyze(gpa: Allocator, air: Air, intern_pool: *InternPool) Allocator.Error!Liveness { +pub fn analyze(zcu: *Zcu, air: Air, intern_pool: *InternPool) Allocator.Error!Liveness { const tracy = trace(@src()); defer tracy.end(); + const gpa = zcu.gpa; + var a: Analysis = .{ .gpa = gpa, + .zcu = zcu, .air = air, .tomb_bits = try gpa.alloc( usize, @@ -220,6 +224,7 @@ const OperandCategory = enum { pub fn categorizeOperand( l: Liveness, air: Air, + zcu: *Zcu, inst: Air.Inst.Index, operand: Air.Inst.Index, ip: *const InternPool, @@ -511,10 +516,15 @@ pub fn categorizeOperand( if (extra.rhs == operand_ref) return matchOperandSmallIndex(l, inst, 2, .none); return .none; }, - .shuffle => { - const extra = air.extraData(Air.Shuffle, air_datas[@intFromEnum(inst)].ty_pl.payload).data; - if (extra.a == operand_ref) return matchOperandSmallIndex(l, inst, 0, .none); - if (extra.b == operand_ref) return matchOperandSmallIndex(l, inst, 1, .none); + .shuffle_one => { + const unwrapped = air.unwrapShuffleOne(zcu, inst); + if (unwrapped.operand == operand_ref) return matchOperandSmallIndex(l, inst, 0, .none); + return .none; + }, + .shuffle_two => { + const unwrapped = air.unwrapShuffleTwo(zcu, inst); + if (unwrapped.operand_a == operand_ref) return matchOperandSmallIndex(l, inst, 0, .none); + if (unwrapped.operand_b == operand_ref) return matchOperandSmallIndex(l, inst, 1, .none); return .none; }, .reduce, .reduce_optimized => { @@ -639,7 +649,7 @@ pub fn categorizeOperand( var operand_live: bool = true; for (&[_]Air.Inst.Index{ then_body[0], else_body[0] }) |cond_inst| { - if (l.categorizeOperand(air, cond_inst, operand, ip) == .tomb) + if (l.categorizeOperand(air, zcu, cond_inst, operand, ip) == .tomb) operand_live = false; switch (air_tags[@intFromEnum(cond_inst)]) { @@ -824,6 
+834,7 @@ pub const BigTomb = struct { /// In-progress data; on successful analysis converted into `Liveness`. const Analysis = struct { gpa: Allocator, + zcu: *Zcu, air: Air, intern_pool: *InternPool, tomb_bits: []usize, @@ -1119,9 +1130,13 @@ fn analyzeInst( const extra = a.air.extraData(Air.Bin, pl_op.payload).data; return analyzeOperands(a, pass, data, inst, .{ pl_op.operand, extra.lhs, extra.rhs }); }, - .shuffle => { - const extra = a.air.extraData(Air.Shuffle, inst_datas[@intFromEnum(inst)].ty_pl.payload).data; - return analyzeOperands(a, pass, data, inst, .{ extra.a, extra.b, .none }); + .shuffle_one => { + const unwrapped = a.air.unwrapShuffleOne(a.zcu, inst); + return analyzeOperands(a, pass, data, inst, .{ unwrapped.operand, .none, .none }); + }, + .shuffle_two => { + const unwrapped = a.air.unwrapShuffleTwo(a.zcu, inst); + return analyzeOperands(a, pass, data, inst, .{ unwrapped.operand_a, unwrapped.operand_b, .none }); }, .reduce, .reduce_optimized => { const reduce = inst_datas[@intFromEnum(inst)].reduce; diff --git a/src/Air/Liveness/Verify.zig b/src/Air/Liveness/Verify.zig index e7ed37956d..4ad24cf924 100644 --- a/src/Air/Liveness/Verify.zig +++ b/src/Air/Liveness/Verify.zig @@ -1,6 +1,7 @@ //! Verifies that Liveness information is valid. 
gpa: std.mem.Allocator, +zcu: *Zcu, air: Air, liveness: Liveness, live: LiveMap = .{}, @@ -287,10 +288,13 @@ fn verifyBody(self: *Verify, body: []const Air.Inst.Index) Error!void { const extra = self.air.extraData(Air.Bin, ty_pl.payload).data; try self.verifyInstOperands(inst, .{ extra.lhs, extra.rhs, .none }); }, - .shuffle => { - const ty_pl = data[@intFromEnum(inst)].ty_pl; - const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data; - try self.verifyInstOperands(inst, .{ extra.a, extra.b, .none }); + .shuffle_one => { + const unwrapped = self.air.unwrapShuffleOne(self.zcu, inst); + try self.verifyInstOperands(inst, .{ unwrapped.operand, .none, .none }); + }, + .shuffle_two => { + const unwrapped = self.air.unwrapShuffleTwo(self.zcu, inst); + try self.verifyInstOperands(inst, .{ unwrapped.operand_a, unwrapped.operand_b, .none }); }, .cmp_vector, .cmp_vector_optimized, @@ -639,4 +643,5 @@ const log = std.log.scoped(.liveness_verify); const Air = @import("../../Air.zig"); const Liveness = @import("../Liveness.zig"); const InternPool = @import("../../InternPool.zig"); +const Zcu = @import("../../Zcu.zig"); const Verify = @This(); diff --git a/src/Air/types_resolved.zig b/src/Air/types_resolved.zig index 873f70ec50..eb17402ebe 100644 --- a/src/Air/types_resolved.zig +++ b/src/Air/types_resolved.zig @@ -249,12 +249,22 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { if (!checkRef(extra.struct_operand, zcu)) return false; }, - .shuffle => { - const extra = air.extraData(Air.Shuffle, data.ty_pl.payload).data; - if (!checkType(data.ty_pl.ty.toType(), zcu)) return false; - if (!checkRef(extra.a, zcu)) return false; - if (!checkRef(extra.b, zcu)) return false; - if (!checkVal(Value.fromInterned(extra.mask), zcu)) return false; + .shuffle_one => { + const unwrapped = air.unwrapShuffleOne(zcu, inst); + if (!checkType(unwrapped.result_ty, zcu)) return false; + if (!checkRef(unwrapped.operand, zcu)) return false; + for (unwrapped.mask) |m| 
switch (m.unwrap()) { + .elem => {}, + .value => |val| if (!checkVal(.fromInterned(val), zcu)) return false, + }; + }, + + .shuffle_two => { + const unwrapped = air.unwrapShuffleTwo(zcu, inst); + if (!checkType(unwrapped.result_ty, zcu)) return false; + if (!checkRef(unwrapped.operand_a, zcu)) return false; + if (!checkRef(unwrapped.operand_b, zcu)) return false; + // No values to check because there are no comptime-known values other than undef }, .cmpxchg_weak, diff --git a/src/Sema.zig b/src/Sema.zig index 34fb468716..3c4fc555cb 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -24256,8 +24256,8 @@ fn analyzeShuffle( block: *Block, src_node: std.zig.Ast.Node.Offset, elem_ty: Type, - a_arg: Air.Inst.Ref, - b_arg: Air.Inst.Ref, + a_uncoerced: Air.Inst.Ref, + b_uncoerced: Air.Inst.Ref, mask: Value, mask_len: u32, ) CompileError!Air.Inst.Ref { @@ -24266,150 +24266,154 @@ fn analyzeShuffle( const a_src = block.builtinCallArgSrc(src_node, 1); const b_src = block.builtinCallArgSrc(src_node, 2); const mask_src = block.builtinCallArgSrc(src_node, 3); - var a = a_arg; - var b = b_arg; - const res_ty = try pt.vectorType(.{ - .len = mask_len, - .child = elem_ty.toIntern(), - }); - - const maybe_a_len = switch (sema.typeOf(a).zigTypeTag(zcu)) { - .array, .vector => sema.typeOf(a).arrayLen(zcu), - .undefined => null, - else => return sema.fail(block, a_src, "expected vector or array with element type '{}', found '{}'", .{ - elem_ty.fmt(pt), - sema.typeOf(a).fmt(pt), - }), - }; - const maybe_b_len = switch (sema.typeOf(b).zigTypeTag(zcu)) { - .array, .vector => sema.typeOf(b).arrayLen(zcu), - .undefined => null, - else => return sema.fail(block, b_src, "expected vector or array with element type '{}', found '{}'", .{ - elem_ty.fmt(pt), - sema.typeOf(b).fmt(pt), - }), - }; - if (maybe_a_len == null and maybe_b_len == null) { - return pt.undefRef(res_ty); - } - const a_len: u32 = @intCast(maybe_a_len orelse maybe_b_len.?); - const b_len: u32 = @intCast(maybe_b_len orelse a_len); - 
- const a_ty = try pt.vectorType(.{ - .len = a_len, - .child = elem_ty.toIntern(), - }); - const b_ty = try pt.vectorType(.{ - .len = b_len, - .child = elem_ty.toIntern(), - }); - - if (maybe_a_len == null) a = try pt.undefRef(a_ty) else a = try sema.coerce(block, a_ty, a, a_src); - if (maybe_b_len == null) b = try pt.undefRef(b_ty) else b = try sema.coerce(block, b_ty, b, b_src); - - const operand_info = [2]std.meta.Tuple(&.{ u64, LazySrcLoc, Type }){ - .{ a_len, a_src, a_ty }, - .{ b_len, b_src, b_ty }, - }; - - for (0..@intCast(mask_len)) |i| { - const elem = try mask.elemValue(pt, i); - if (elem.isUndef(zcu)) continue; - const elem_resolved = try sema.resolveLazyValue(elem); - const int = elem_resolved.toSignedInt(zcu); - var unsigned: u32 = undefined; - var chosen: u32 = undefined; - if (int >= 0) { - unsigned = @intCast(int); - chosen = 0; - } else { - unsigned = @intCast(~int); - chosen = 1; + // If the type of `a` is `@Type(.undefined)`, i.e. the argument is untyped, this is 0, because it is an error to index into this vector. + const a_len: u32 = switch (sema.typeOf(a_uncoerced).zigTypeTag(zcu)) { + .array, .vector => @intCast(sema.typeOf(a_uncoerced).arrayLen(zcu)), + .undefined => 0, + else => return sema.fail(block, a_src, "expected vector of '{}', found '{}'", .{ elem_ty.fmt(pt), sema.typeOf(a_uncoerced).fmt(pt) }), + }; + const a_ty = try pt.vectorType(.{ .len = a_len, .child = elem_ty.toIntern() }); + const a_coerced = try sema.coerce(block, a_ty, a_uncoerced, a_src); + + // If the type of `b` is `@Type(.undefined)`, i.e. the argument is untyped, this is 0, because it is an error to index into this vector. 
+ const b_len: u32 = switch (sema.typeOf(b_uncoerced).zigTypeTag(zcu)) { + .array, .vector => @intCast(sema.typeOf(b_uncoerced).arrayLen(zcu)), + .undefined => 0, + else => return sema.fail(block, b_src, "expected vector of '{}', found '{}'", .{ elem_ty.fmt(pt), sema.typeOf(b_uncoerced).fmt(pt) }), + }; + const b_ty = try pt.vectorType(.{ .len = b_len, .child = elem_ty.toIntern() }); + const b_coerced = try sema.coerce(block, b_ty, b_uncoerced, b_src); + + const result_ty = try pt.vectorType(.{ .len = mask_len, .child = elem_ty.toIntern() }); + + // We're going to pre-emptively reserve space in `sema.air_extra`. The reason for this is we need + // a `u32` buffer of length `mask_len` anyway, and putting it in `sema.air_extra` avoids a copy + // in the runtime case. If the result is comptime-known, we'll shrink `air_extra` back. + const air_extra_idx: u32 = @intCast(sema.air_extra.items.len); + const air_mask_buf = try sema.air_extra.addManyAsSlice(sema.gpa, mask_len); + + // We want to interpret that buffer in `air_extra` in a few ways. Initially, we'll consider its + // elements as `Air.Inst.ShuffleTwoMask`, essentially representing the raw mask values; then, we'll + // convert it to `InternPool.Index` or `Air.Inst.ShuffleOneMask` if there are comptime-known operands. + const mask_ip_index: []InternPool.Index = @ptrCast(air_mask_buf); + const mask_shuffle_one: []Air.ShuffleOneMask = @ptrCast(air_mask_buf); + const mask_shuffle_two: []Air.ShuffleTwoMask = @ptrCast(air_mask_buf); + + // Initial loop: check mask elements, populate `mask_shuffle_two`. 
+ var a_used = false; + var b_used = false; + for (mask_shuffle_two, 0..mask_len) |*out, mask_idx| { + const mask_val = try mask.elemValue(pt, mask_idx); + if (mask_val.isUndef(zcu)) { + out.* = .undef; + continue; } - if (unsigned >= operand_info[chosen][0]) { - const msg = msg: { - const msg = try sema.errMsg(mask_src, "mask index '{d}' has out-of-bounds selection", .{i}); + // Safe because mask elements are `i32` and we already checked for undef: + const raw = (try sema.resolveLazyValue(mask_val)).toSignedInt(zcu); + if (raw >= 0) { + const idx: u32 = @intCast(raw); + a_used = true; + out.* = .aElem(idx); + if (idx >= a_len) return sema.failWithOwnedErrorMsg(block, msg: { + const msg = try sema.errMsg(mask_src, "mask element at index '{d}' selects out-of-bounds index", .{mask_idx}); errdefer msg.destroy(sema.gpa); - - try sema.errNote(operand_info[chosen][1], msg, "selected index '{d}' out of bounds of '{}'", .{ - unsigned, - operand_info[chosen][2].fmt(pt), - }); - - if (chosen == 0) { - try sema.errNote(b_src, msg, "selections from the second vector are specified with negative numbers", .{}); + try sema.errNote(a_src, msg, "index '{d}' exceeds bounds of '{}' given here", .{ idx, a_ty.fmt(pt) }); + if (idx < b_len) { + try sema.errNote(b_src, msg, "use '~@as(u32, {d})' to index into second vector given here", .{idx}); } - break :msg msg; - }; - return sema.failWithOwnedErrorMsg(block, msg); + }); + } else { + const idx: u32 = @intCast(~raw); + b_used = true; + out.* = .bElem(idx); + if (idx >= b_len) return sema.failWithOwnedErrorMsg(block, msg: { + const msg = try sema.errMsg(mask_src, "mask element at index '{d}' selects out-of-bounds index", .{mask_idx}); + errdefer msg.destroy(sema.gpa); + try sema.errNote(b_src, msg, "index '{d}' exceeds bounds of '{}' given here", .{ idx, b_ty.fmt(pt) }); + break :msg msg; + }); } } - if (try sema.resolveValue(a)) |a_val| { - if (try sema.resolveValue(b)) |b_val| { - const values = try sema.arena.alloc(InternPool.Index, 
mask_len); - for (values, 0..) |*value, i| { - const mask_elem_val = try mask.elemValue(pt, i); - if (mask_elem_val.isUndef(zcu)) { - value.* = try pt.intern(.{ .undef = elem_ty.toIntern() }); - continue; - } - const int = mask_elem_val.toSignedInt(zcu); - const unsigned: u32 = @intCast(if (int >= 0) int else ~int); - values[i] = (try (if (int >= 0) a_val else b_val).elemValue(pt, unsigned)).toIntern(); - } - return Air.internedToRef((try pt.intern(.{ .aggregate = .{ - .ty = res_ty.toIntern(), - .storage = .{ .elems = values }, - } }))); - } - } + const maybe_a_val = try sema.resolveValue(a_coerced); + const maybe_b_val = try sema.resolveValue(b_coerced); - // All static analysis passed, and not comptime. - // For runtime codegen, vectors a and b must be the same length. Here we - // recursively @shuffle the smaller vector to append undefined elements - // to it up to the length of the longer vector. This recursion terminates - // in 1 call because these calls to analyzeShuffle guarantee a_len == b_len. - if (a_len != b_len) { - const min_len = @min(a_len, b_len); - const max_src = if (a_len > b_len) a_src else b_src; - const max_len = try sema.usizeCast(block, max_src, @max(a_len, b_len)); + const a_rt = a_used and maybe_a_val == null; + const b_rt = b_used and maybe_b_val == null; - const expand_mask_values = try sema.arena.alloc(InternPool.Index, max_len); - for (@intCast(0)..@intCast(min_len)) |i| { - expand_mask_values[i] = (try pt.intValue(.comptime_int, i)).toIntern(); + if (a_rt and b_rt) { + // Both operands are needed and runtime-known. We need a `[]ShuffleTwoMask`... which is + // exactly what we already have in `mask_shuffle_two`! So, we're basically done already. + // We just need to append the two operands. 
+ try sema.air_extra.ensureUnusedCapacity(sema.gpa, 2); + sema.appendRefsAssumeCapacity(&.{ a_coerced, b_coerced }); + return block.addInst(.{ + .tag = .shuffle_two, + .data = .{ .ty_pl = .{ + .ty = Air.internedToRef(result_ty.toIntern()), + .payload = air_extra_idx, + } }, + }); + } else if (a_rt) { + // We need to convert the `ShuffleTwoMask` values to `ShuffleOneMask`. + for (mask_shuffle_two, mask_shuffle_one) |in, *out| { + out.* = switch (in.unwrap()) { + .undef => .value(try pt.undefValue(elem_ty)), + .a_elem => |idx| .elem(idx), + .b_elem => |idx| .value(try maybe_b_val.?.elemValue(pt, idx)), + }; } - for (@intCast(min_len)..@intCast(max_len)) |i| { - expand_mask_values[i] = .negative_one; + // Now just append our single runtime operand, and we're done. + try sema.air_extra.ensureUnusedCapacity(sema.gpa, 1); + sema.appendRefsAssumeCapacity(&.{a_coerced}); + return block.addInst(.{ + .tag = .shuffle_one, + .data = .{ .ty_pl = .{ + .ty = Air.internedToRef(result_ty.toIntern()), + .payload = air_extra_idx, + } }, + }); + } else if (b_rt) { + // We need to convert the `ShuffleTwoMask` values to `ShuffleOneMask`. 
+ for (mask_shuffle_two, mask_shuffle_one) |in, *out| { + out.* = switch (in.unwrap()) { + .undef => .value(try pt.undefValue(elem_ty)), + .a_elem => |idx| .value(try maybe_a_val.?.elemValue(pt, idx)), + .b_elem => |idx| .elem(idx), + }; } - const expand_mask = try pt.intern(.{ .aggregate = .{ - .ty = (try pt.vectorType(.{ .len = @intCast(max_len), .child = .comptime_int_type })).toIntern(), - .storage = .{ .elems = expand_mask_values }, - } }); - - if (a_len < b_len) { - const undef = try pt.undefRef(a_ty); - a = try sema.analyzeShuffle(block, src_node, elem_ty, a, undef, Value.fromInterned(expand_mask), @intCast(max_len)); - } else { - const undef = try pt.undefRef(b_ty); - b = try sema.analyzeShuffle(block, src_node, elem_ty, b, undef, Value.fromInterned(expand_mask), @intCast(max_len)); + // Now just append our single runtime operand, and we're done. + try sema.air_extra.ensureUnusedCapacity(sema.gpa, 1); + sema.appendRefsAssumeCapacity(&.{b_coerced}); + return block.addInst(.{ + .tag = .shuffle_one, + .data = .{ .ty_pl = .{ + .ty = Air.internedToRef(result_ty.toIntern()), + .payload = air_extra_idx, + } }, + }); + } else { + // The result will be comptime-known. We must convert the `ShuffleTwoMask` values to + // `InternPool.Index` values using the known operands. + for (mask_shuffle_two, mask_ip_index) |in, *out| { + const val: Value = switch (in.unwrap()) { + .undef => try pt.undefValue(elem_ty), + .a_elem => |idx| try maybe_a_val.?.elemValue(pt, idx), + .b_elem => |idx| try maybe_b_val.?.elemValue(pt, idx), + }; + out.* = val.toIntern(); } + const res = try pt.intern(.{ .aggregate = .{ + .ty = result_ty.toIntern(), + .storage = .{ .elems = mask_ip_index }, + } }); + // We have a comptime-known result, so didn't need `air_mask_buf` -- remove it from `sema.air_extra`. 
+ assert(sema.air_extra.items.len == air_extra_idx + air_mask_buf.len); + sema.air_extra.shrinkRetainingCapacity(air_extra_idx); + return Air.internedToRef(res); } - - return block.addInst(.{ - .tag = .shuffle, - .data = .{ .ty_pl = .{ - .ty = Air.internedToRef(res_ty.toIntern()), - .payload = try block.sema.addExtra(Air.Shuffle{ - .a = a, - .b = b, - .mask = mask.toIntern(), - .mask_len = mask_len, - }), - } }, - }); } fn zirSelect(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.InstData) CompileError!Air.Inst.Ref { diff --git a/src/Zcu/PerThread.zig b/src/Zcu/PerThread.zig index f8ae173b75..8e3d07627f 100644 --- a/src/Zcu/PerThread.zig +++ b/src/Zcu/PerThread.zig @@ -1745,7 +1745,7 @@ pub fn linkerUpdateFunc(pt: Zcu.PerThread, func_index: InternPool.Index, air: *A try air.legalize(pt, @import("../codegen.zig").legalizeFeatures(pt, nav_index) orelse break :legalize); } - var liveness = try Air.Liveness.analyze(gpa, air.*, ip); + var liveness = try Air.Liveness.analyze(zcu, air.*, ip); defer liveness.deinit(gpa); if (build_options.enable_debug_extensions and comp.verbose_air) { @@ -1757,6 +1757,7 @@ pub fn linkerUpdateFunc(pt: Zcu.PerThread, func_index: InternPool.Index, air: *A if (std.debug.runtime_safety) { var verify: Air.Liveness.Verify = .{ .gpa = gpa, + .zcu = zcu, .air = air.*, .liveness = liveness, .intern_pool = ip, diff --git a/src/arch/aarch64/CodeGen.zig b/src/arch/aarch64/CodeGen.zig index e9e7159938..c01fa24ecc 100644 --- a/src/arch/aarch64/CodeGen.zig +++ b/src/arch/aarch64/CodeGen.zig @@ -778,7 +778,8 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .error_name => try self.airErrorName(inst), .splat => try self.airSplat(inst), .select => try self.airSelect(inst), - .shuffle => try self.airShuffle(inst), + .shuffle_one => try self.airShuffleOne(inst), + .shuffle_two => try self.airShuffleTwo(inst), .reduce => try self.airReduce(inst), .aggregate_init => try self.airAggregateInit(inst), .union_init => try 
self.airUnionInit(inst), @@ -6049,11 +6050,14 @@ fn airSelect(self: *Self, inst: Air.Inst.Index) InnerError!void { return self.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs }); } -fn airShuffle(self: *Self, inst: Air.Inst.Index) InnerError!void { - const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; - const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data; - const result: MCValue = if (self.liveness.isUnused(inst)) .dead else return self.fail("TODO implement airShuffle for {}", .{self.target.cpu.arch}); - return self.finishAir(inst, result, .{ extra.a, extra.b, .none }); +fn airShuffleOne(self: *Self, inst: Air.Inst.Index) InnerError!void { + _ = inst; + return self.fail("TODO implement airShuffleOne for {}", .{self.target.cpu.arch}); +} + +fn airShuffleTwo(self: *Self, inst: Air.Inst.Index) InnerError!void { + _ = inst; + return self.fail("TODO implement airShuffleTwo for {}", .{self.target.cpu.arch}); } fn airReduce(self: *Self, inst: Air.Inst.Index) InnerError!void { diff --git a/src/arch/arm/CodeGen.zig b/src/arch/arm/CodeGen.zig index 8cc1d0a607..d687c74c15 100644 --- a/src/arch/arm/CodeGen.zig +++ b/src/arch/arm/CodeGen.zig @@ -767,7 +767,8 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .error_name => try self.airErrorName(inst), .splat => try self.airSplat(inst), .select => try self.airSelect(inst), - .shuffle => try self.airShuffle(inst), + .shuffle_one => try self.airShuffleOne(inst), + .shuffle_two => try self.airShuffleTwo(inst), .reduce => try self.airReduce(inst), .aggregate_init => try self.airAggregateInit(inst), .union_init => try self.airUnionInit(inst), @@ -6021,10 +6022,14 @@ fn airSelect(self: *Self, inst: Air.Inst.Index) !void { return self.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs }); } -fn airShuffle(self: *Self, inst: Air.Inst.Index) !void { - const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op; - const result: MCValue 
= if (self.liveness.isUnused(inst)) .dead else return self.fail("TODO implement airShuffle for arm", .{}); - return self.finishAir(inst, result, .{ ty_op.operand, .none, .none }); +fn airShuffleOne(self: *Self, inst: Air.Inst.Index) !void { + _ = inst; + return self.fail("TODO implement airShuffleOne for arm", .{}); +} + +fn airShuffleTwo(self: *Self, inst: Air.Inst.Index) !void { + _ = inst; + return self.fail("TODO implement airShuffleTwo for arm", .{}); } fn airReduce(self: *Self, inst: Air.Inst.Index) !void { diff --git a/src/arch/riscv64/CodeGen.zig b/src/arch/riscv64/CodeGen.zig index 8a40c61cdc..1d17d34189 100644 --- a/src/arch/riscv64/CodeGen.zig +++ b/src/arch/riscv64/CodeGen.zig @@ -1586,7 +1586,8 @@ fn genBody(func: *Func, body: []const Air.Inst.Index) InnerError!void { .error_name => try func.airErrorName(inst), .splat => try func.airSplat(inst), .select => try func.airSelect(inst), - .shuffle => try func.airShuffle(inst), + .shuffle_one => try func.airShuffleOne(inst), + .shuffle_two => try func.airShuffleTwo(inst), .reduce => try func.airReduce(inst), .aggregate_init => try func.airAggregateInit(inst), .union_init => try func.airUnionInit(inst), @@ -8030,10 +8031,14 @@ fn airSelect(func: *Func, inst: Air.Inst.Index) !void { return func.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs }); } -fn airShuffle(func: *Func, inst: Air.Inst.Index) !void { - const ty_op = func.air.instructions.items(.data)[@intFromEnum(inst)].ty_op; - const result: MCValue = if (func.liveness.isUnused(inst)) .unreach else return func.fail("TODO implement airShuffle for riscv64", .{}); - return func.finishAir(inst, result, .{ ty_op.operand, .none, .none }); +fn airShuffleOne(func: *Func, inst: Air.Inst.Index) !void { + _ = inst; + return func.fail("TODO implement airShuffleOne for riscv64", .{}); +} + +fn airShuffleTwo(func: *Func, inst: Air.Inst.Index) !void { + _ = inst; + return func.fail("TODO implement airShuffleTwo for riscv64", .{}); } fn airReduce(func: 
*Func, inst: Air.Inst.Index) !void { diff --git a/src/arch/sparc64/CodeGen.zig b/src/arch/sparc64/CodeGen.zig index d473222288..439e5e6dbb 100644 --- a/src/arch/sparc64/CodeGen.zig +++ b/src/arch/sparc64/CodeGen.zig @@ -621,7 +621,8 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .error_name => try self.airErrorName(inst), .splat => try self.airSplat(inst), .select => @panic("TODO try self.airSelect(inst)"), - .shuffle => @panic("TODO try self.airShuffle(inst)"), + .shuffle_one => @panic("TODO try self.airShuffleOne(inst)"), + .shuffle_two => @panic("TODO try self.airShuffleTwo(inst)"), .reduce => @panic("TODO try self.airReduce(inst)"), .aggregate_init => try self.airAggregateInit(inst), .union_init => try self.airUnionInit(inst), diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig index 36908eb236..ebc46179c3 100644 --- a/src/arch/wasm/CodeGen.zig +++ b/src/arch/wasm/CodeGen.zig @@ -2004,7 +2004,8 @@ fn genInst(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { .ret_load => cg.airRetLoad(inst), .splat => cg.airSplat(inst), .select => cg.airSelect(inst), - .shuffle => cg.airShuffle(inst), + .shuffle_one => cg.airShuffleOne(inst), + .shuffle_two => cg.airShuffleTwo(inst), .reduce => cg.airReduce(inst), .aggregate_init => cg.airAggregateInit(inst), .union_init => cg.airUnionInit(inst), @@ -5177,66 +5178,100 @@ fn airSelect(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { return cg.fail("TODO: Implement wasm airSelect", .{}); } -fn airShuffle(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { +fn airShuffleOne(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { const pt = cg.pt; const zcu = pt.zcu; - const inst_ty = cg.typeOfIndex(inst); - const ty_pl = cg.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; - const extra = cg.air.extraData(Air.Shuffle, ty_pl.payload).data; - - const a = try cg.resolveInst(extra.a); - const b = try cg.resolveInst(extra.b); - const mask = Value.fromInterned(extra.mask); - 
const mask_len = extra.mask_len; - const child_ty = inst_ty.childType(zcu); - const elem_size = child_ty.abiSize(zcu); + const unwrapped = cg.air.unwrapShuffleOne(zcu, inst); + const result_ty = unwrapped.result_ty; + const mask = unwrapped.mask; + const operand = try cg.resolveInst(unwrapped.operand); - // TODO: One of them could be by ref; handle in loop - if (isByRef(cg.typeOf(extra.a), zcu, cg.target) or isByRef(inst_ty, zcu, cg.target)) { - const result = try cg.allocStack(inst_ty); + const elem_ty = result_ty.childType(zcu); + const elem_size = elem_ty.abiSize(zcu); - for (0..mask_len) |index| { - const value = (try mask.elemValue(pt, index)).toSignedInt(zcu); + // TODO: this function could have an `i8x16_shuffle` fast path like `airShuffleTwo` if we were + // to lower the comptime-known operands to a non-by-ref vector value. - try cg.emitWValue(result); + // TODO: this is incorrect if either operand or the result is *not* by-ref, which is possible. + // I tried to fix it, but I couldn't make much sense of how this backend handles memory. - const loaded = if (value >= 0) - try cg.load(a, child_ty, @as(u32, @intCast(@as(i64, @intCast(elem_size)) * value))) - else - try cg.load(b, child_ty, @as(u32, @intCast(@as(i64, @intCast(elem_size)) * ~value))); + const dest_alloc = try cg.allocStack(result_ty); + for (mask, 0..) 
|mask_elem, out_idx| { + try cg.emitWValue(dest_alloc); + const elem_val = switch (mask_elem.unwrap()) { + .elem => |idx| try cg.load(operand, elem_ty, @intCast(elem_size * idx)), + .value => |val| try cg.lowerConstant(.fromInterned(val), elem_ty), + }; + try cg.store(.stack, elem_val, elem_ty, @intCast(dest_alloc.offset() + elem_size * out_idx)); + } + return cg.finishAir(inst, dest_alloc, &.{unwrapped.operand}); +} - try cg.store(.stack, loaded, child_ty, result.stack_offset.value + @as(u32, @intCast(elem_size)) * @as(u32, @intCast(index))); - } +fn airShuffleTwo(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { + const pt = cg.pt; + const zcu = pt.zcu; - return cg.finishAir(inst, result, &.{ extra.a, extra.b }); - } else { - var operands = [_]u32{ - @intFromEnum(std.wasm.SimdOpcode.i8x16_shuffle), - } ++ [1]u32{undefined} ** 4; + const unwrapped = cg.air.unwrapShuffleTwo(zcu, inst); + const result_ty = unwrapped.result_ty; + const mask = unwrapped.mask; + const operand_a = try cg.resolveInst(unwrapped.operand_a); + const operand_b = try cg.resolveInst(unwrapped.operand_b); - var lanes = mem.asBytes(operands[1..]); - for (0..@as(usize, @intCast(mask_len))) |index| { - const mask_elem = (try mask.elemValue(pt, index)).toSignedInt(zcu); - const base_index = if (mask_elem >= 0) - @as(u8, @intCast(@as(i64, @intCast(elem_size)) * mask_elem)) - else - 16 + @as(u8, @intCast(@as(i64, @intCast(elem_size)) * ~mask_elem)); + const a_ty = cg.typeOf(unwrapped.operand_a); + const b_ty = cg.typeOf(unwrapped.operand_b); + const elem_ty = result_ty.childType(zcu); + const elem_size = elem_ty.abiSize(zcu); - for (0..@as(usize, @intCast(elem_size))) |byte_offset| { - lanes[index * @as(usize, @intCast(elem_size)) + byte_offset] = base_index + @as(u8, @intCast(byte_offset)); + // WASM has `i8x16_shuffle`, which we can apply if the element type bit size is a multiple of 8 + // and the input and output vectors have a bit size of 128 (and are hence not by-ref). 
Otherwise, + // we fall back to a naive loop lowering. + if (!isByRef(a_ty, zcu, cg.target) and + !isByRef(b_ty, zcu, cg.target) and + !isByRef(result_ty, zcu, cg.target) and + elem_ty.bitSize(zcu) % 8 == 0) + { + var lane_map: [16]u8 align(4) = undefined; + const lanes_per_elem = elem_ty.bitSize(zcu) / 8; + for (mask, 0..) |mask_elem, out_idx| { + const out_first_lane = out_idx * lanes_per_elem; + const in_first_lane = switch (mask_elem.unwrap()) { + .a_elem => |i| i * lanes_per_elem, + .b_elem => |i| i * lanes_per_elem + 16, + .undef => 0, // doesn't matter + }; + for (lane_map[out_first_lane..][0..lanes_per_elem], in_first_lane..) |*out, in| { + out.* = @intCast(in); } } - - try cg.emitWValue(a); - try cg.emitWValue(b); - + try cg.emitWValue(operand_a); + try cg.emitWValue(operand_b); const extra_index = cg.extraLen(); - try cg.mir_extra.appendSlice(cg.gpa, &operands); + try cg.mir_extra.appendSlice(cg.gpa, &.{ + @intFromEnum(std.wasm.SimdOpcode.i8x16_shuffle), + @bitCast(lane_map[0..4].*), + @bitCast(lane_map[4..8].*), + @bitCast(lane_map[8..12].*), + @bitCast(lane_map[12..].*), + }); try cg.addInst(.{ .tag = .simd_prefix, .data = .{ .payload = extra_index } }); + return cg.finishAir(inst, .stack, &.{ unwrapped.operand_a, unwrapped.operand_b }); + } + + // TODO: this is incorrect if either operand or the result is *not* by-ref, which is possible. + // I tried to fix it, but I couldn't make much sense of how this backend handles memory. - return cg.finishAir(inst, .stack, &.{ extra.a, extra.b }); + const dest_alloc = try cg.allocStack(result_ty); + for (mask, 0..) 
|mask_elem, out_idx| { + try cg.emitWValue(dest_alloc); + const elem_val = switch (mask_elem.unwrap()) { + .a_elem => |idx| try cg.load(operand_a, elem_ty, @intCast(elem_size * idx)), + .b_elem => |idx| try cg.load(operand_b, elem_ty, @intCast(elem_size * idx)), + .undef => try cg.emitUndefined(elem_ty), + }; + try cg.store(.stack, elem_val, elem_ty, @intCast(dest_alloc.offset() + elem_size * out_idx)); } + return cg.finishAir(inst, dest_alloc, &.{ unwrapped.operand_a, unwrapped.operand_b }); } fn airReduce(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index f943b7f415..f6d8d61adc 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2490,7 +2490,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { switch (air_tags[@intFromEnum(inst)]) { // zig fmt: off .select => try cg.airSelect(inst), - .shuffle => try cg.airShuffle(inst), + .shuffle_one, .shuffle_two => @panic("x86_64 TODO: shuffle_one/shuffle_two"), // zig fmt: on .arg => if (cg.debug_output != .none) { diff --git a/src/codegen/c.zig b/src/codegen/c.zig index 8d947ce56a..c68abc06ce 100644 --- a/src/codegen/c.zig +++ b/src/codegen/c.zig @@ -3374,7 +3374,8 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail, .error_name => try airErrorName(f, inst), .splat => try airSplat(f, inst), .select => try airSelect(f, inst), - .shuffle => try airShuffle(f, inst), + .shuffle_one => try airShuffleOne(f, inst), + .shuffle_two => try airShuffleTwo(f, inst), .reduce => try airReduce(f, inst), .aggregate_init => try airAggregateInit(f, inst), .union_init => try airUnionInit(f, inst), @@ -7163,34 +7164,73 @@ fn airSelect(f: *Function, inst: Air.Inst.Index) !CValue { return local; } -fn airShuffle(f: *Function, inst: Air.Inst.Index) !CValue { +fn airShuffleOne(f: *Function, inst: Air.Inst.Index) !CValue { const pt = f.object.dg.pt; const zcu = pt.zcu; - const ty_pl = 
f.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; - const extra = f.air.extraData(Air.Shuffle, ty_pl.payload).data; - - const mask = Value.fromInterned(extra.mask); - const lhs = try f.resolveInst(extra.a); - const rhs = try f.resolveInst(extra.b); - const inst_ty = f.typeOfIndex(inst); + const unwrapped = f.air.unwrapShuffleOne(zcu, inst); + const mask = unwrapped.mask; + const operand = try f.resolveInst(unwrapped.operand); + const inst_ty = unwrapped.result_ty; const writer = f.object.writer(); const local = try f.allocLocal(inst, inst_ty); - try reap(f, inst, &.{ extra.a, extra.b }); // local cannot alias operands - for (0..extra.mask_len) |index| { + try reap(f, inst, &.{unwrapped.operand}); // local cannot alias operand + for (mask, 0..) |mask_elem, out_idx| { try f.writeCValue(writer, local, .Other); try writer.writeByte('['); - try f.object.dg.renderValue(writer, try pt.intValue(.usize, index), .Other); + try f.object.dg.renderValue(writer, try pt.intValue(.usize, out_idx), .Other); try writer.writeAll("] = "); + switch (mask_elem.unwrap()) { + .elem => |src_idx| { + try f.writeCValue(writer, operand, .Other); + try writer.writeByte('['); + try f.object.dg.renderValue(writer, try pt.intValue(.usize, src_idx), .Other); + try writer.writeByte(']'); + }, + .value => |val| try f.object.dg.renderValue(writer, .fromInterned(val), .Other), + } + try writer.writeAll(";\n"); + } - const mask_elem = (try mask.elemValue(pt, index)).toSignedInt(zcu); - const src_val = try pt.intValue(.usize, @as(u64, @intCast(mask_elem ^ mask_elem >> 63))); + return local; +} - try f.writeCValue(writer, if (mask_elem >= 0) lhs else rhs, .Other); +fn airShuffleTwo(f: *Function, inst: Air.Inst.Index) !CValue { + const pt = f.object.dg.pt; + const zcu = pt.zcu; + + const unwrapped = f.air.unwrapShuffleTwo(zcu, inst); + const mask = unwrapped.mask; + const operand_a = try f.resolveInst(unwrapped.operand_a); + const operand_b = try f.resolveInst(unwrapped.operand_b); + const 
inst_ty = unwrapped.result_ty; + const elem_ty = inst_ty.childType(zcu); + + const writer = f.object.writer(); + const local = try f.allocLocal(inst, inst_ty); + try reap(f, inst, &.{ unwrapped.operand_a, unwrapped.operand_b }); // local cannot alias operands + for (mask, 0..) |mask_elem, out_idx| { + try f.writeCValue(writer, local, .Other); try writer.writeByte('['); - try f.object.dg.renderValue(writer, src_val, .Other); - try writer.writeAll("];\n"); + try f.object.dg.renderValue(writer, try pt.intValue(.usize, out_idx), .Other); + try writer.writeAll("] = "); + switch (mask_elem.unwrap()) { + .a_elem => |src_idx| { + try f.writeCValue(writer, operand_a, .Other); + try writer.writeByte('['); + try f.object.dg.renderValue(writer, try pt.intValue(.usize, src_idx), .Other); + try writer.writeByte(']'); + }, + .b_elem => |src_idx| { + try f.writeCValue(writer, operand_b, .Other); + try writer.writeByte('['); + try f.object.dg.renderValue(writer, try pt.intValue(.usize, src_idx), .Other); + try writer.writeByte(']'); + }, + .undef => try f.object.dg.renderUndefValue(writer, elem_ty, .Other), + } + try writer.writeAll(";\n"); } return local; diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 77d8f3ff47..960d5f819b 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -4969,7 +4969,8 @@ pub const FuncGen = struct { .error_name => try self.airErrorName(inst), .splat => try self.airSplat(inst), .select => try self.airSelect(inst), - .shuffle => try self.airShuffle(inst), + .shuffle_one => try self.airShuffleOne(inst), + .shuffle_two => try self.airShuffleTwo(inst), .aggregate_init => try self.airAggregateInit(inst), .union_init => try self.airUnionInit(inst), .prefetch => try self.airPrefetch(inst), @@ -9666,7 +9667,7 @@ pub const FuncGen = struct { const zcu = o.pt.zcu; const ip = &zcu.intern_pool; for (body_tail[1..]) |body_inst| { - switch (fg.liveness.categorizeOperand(fg.air, body_inst, body_tail[0], ip)) { + switch 
(fg.liveness.categorizeOperand(fg.air, zcu, body_inst, body_tail[0], ip)) { .none => continue, .write, .noret, .complex => return false, .tomb => return true, @@ -10421,42 +10422,192 @@ pub const FuncGen = struct { return self.wip.select(.normal, pred, a, b, ""); } - fn airShuffle(self: *FuncGen, inst: Air.Inst.Index) !Builder.Value { - const o = self.ng.object; + fn airShuffleOne(fg: *FuncGen, inst: Air.Inst.Index) !Builder.Value { + const o = fg.ng.object; const pt = o.pt; const zcu = pt.zcu; - const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; - const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data; - const a = try self.resolveInst(extra.a); - const b = try self.resolveInst(extra.b); - const mask = Value.fromInterned(extra.mask); - const mask_len = extra.mask_len; - const a_len = self.typeOf(extra.a).vectorLen(zcu); - - // LLVM uses integers larger than the length of the first array to - // index into the second array. This was deemed unnecessarily fragile - // when changing code, so Zig uses negative numbers to index the - // second vector. These start at -1 and go down, and are easiest to use - // with the ~ operator. Here we convert between the two formats. - const values = try self.gpa.alloc(Builder.Constant, mask_len); - defer self.gpa.free(values); - - for (values, 0..) 
|*val, i| { - const elem = try mask.elemValue(pt, i); - if (elem.isUndef(zcu)) { - val.* = try o.builder.undefConst(.i32); - } else { - const int = elem.toSignedInt(zcu); - const unsigned: u32 = @intCast(if (int >= 0) int else ~int + a_len); - val.* = try o.builder.intConst(.i32, unsigned); + const gpa = zcu.gpa; + + const unwrapped = fg.air.unwrapShuffleOne(zcu, inst); + + const operand = try fg.resolveInst(unwrapped.operand); + const mask = unwrapped.mask; + const operand_ty = fg.typeOf(unwrapped.operand); + const llvm_operand_ty = try o.lowerType(operand_ty); + const llvm_result_ty = try o.lowerType(unwrapped.result_ty); + const llvm_elem_ty = try o.lowerType(unwrapped.result_ty.childType(zcu)); + const llvm_poison_elem = try o.builder.poisonConst(llvm_elem_ty); + const llvm_poison_mask_elem = try o.builder.poisonConst(.i32); + const llvm_mask_ty = try o.builder.vectorType(.normal, @intCast(mask.len), .i32); + + // LLVM requires that the two input vectors have the same length, so lowering isn't trivial. + // And, in the words of jacobly0: "llvm sucks at shuffles so we do have to hold its hand at + // least a bit". So, there are two cases here. + // + // If the operand length equals the mask length, we do just the one `shufflevector`, where + // the second operand is a constant vector with comptime-known elements at the right indices + // and poison values elsewhere (in the indices which won't be selected). + // + // Otherwise, we lower to *two* `shufflevector` instructions. The first shuffles the runtime + // operand with an all-poison vector to extract and correctly position all of the runtime + // elements. We also make a constant vector with all of the comptime elements correctly + // positioned. Then, our second instruction selects elements from those "runtime-or-poison" + // and "comptime-or-poison" vectors to compute the result. + + // This buffer is used primarily for the mask constants. 
+ const llvm_elem_buf = try gpa.alloc(Builder.Constant, mask.len); + defer gpa.free(llvm_elem_buf); + + // ...but first, we'll collect all of the comptime-known values. + var any_defined_comptime_value = false; + for (mask, llvm_elem_buf) |mask_elem, *llvm_elem| { + llvm_elem.* = switch (mask_elem.unwrap()) { + .elem => llvm_poison_elem, + .value => |val| if (!Value.fromInterned(val).isUndef(zcu)) elem: { + any_defined_comptime_value = true; + break :elem try o.lowerValue(val); + } else llvm_poison_elem, + }; + } + // This vector is like the result, but runtime elements are replaced with poison. + const comptime_and_poison: Builder.Value = if (any_defined_comptime_value) vec: { + break :vec try o.builder.vectorValue(llvm_result_ty, llvm_elem_buf); + } else try o.builder.poisonValue(llvm_result_ty); + + if (operand_ty.vectorLen(zcu) == mask.len) { + // input length equals mask/output length, so we lower to one instruction + for (mask, llvm_elem_buf, 0..) |mask_elem, *llvm_elem, elem_idx| { + llvm_elem.* = switch (mask_elem.unwrap()) { + .elem => |idx| try o.builder.intConst(.i32, idx), + .value => |val| if (!Value.fromInterned(val).isUndef(zcu)) mask_val: { + break :mask_val try o.builder.intConst(.i32, mask.len + elem_idx); + } else llvm_poison_mask_elem, + }; } + return fg.wip.shuffleVector( + operand, + comptime_and_poison, + try o.builder.vectorValue(llvm_mask_ty, llvm_elem_buf), + "", + ); + } + + for (mask, llvm_elem_buf) |mask_elem, *llvm_elem| { + llvm_elem.* = switch (mask_elem.unwrap()) { + .elem => |idx| try o.builder.intConst(.i32, idx), + .value => llvm_poison_mask_elem, + }; + } + // This vector is like our result, but all comptime-known elements are poison. + const runtime_and_poison = try fg.wip.shuffleVector( + operand, + try o.builder.poisonValue(llvm_operand_ty), + try o.builder.vectorValue(llvm_mask_ty, llvm_elem_buf), + "", + ); + + if (!any_defined_comptime_value) { + // `comptime_and_poison` is just poison; a second shuffle would be a nop. 
+ return runtime_and_poison; + } + + // In this second shuffle, the inputs, the mask, and the output all have the same length. + for (mask, llvm_elem_buf, 0..) |mask_elem, *llvm_elem, elem_idx| { + llvm_elem.* = switch (mask_elem.unwrap()) { + .elem => try o.builder.intConst(.i32, elem_idx), + .value => |val| if (!Value.fromInterned(val).isUndef(zcu)) mask_val: { + break :mask_val try o.builder.intConst(.i32, mask.len + elem_idx); + } else llvm_poison_mask_elem, + }; } + // Merge the runtime and comptime elements with the mask we just built. + return fg.wip.shuffleVector( + runtime_and_poison, + comptime_and_poison, + try o.builder.vectorValue(llvm_mask_ty, llvm_elem_buf), + "", + ); + } + + fn airShuffleTwo(fg: *FuncGen, inst: Air.Inst.Index) !Builder.Value { + const o = fg.ng.object; + const pt = o.pt; + const zcu = pt.zcu; + const gpa = zcu.gpa; + + const unwrapped = fg.air.unwrapShuffleTwo(zcu, inst); + + const mask = unwrapped.mask; + const llvm_elem_ty = try o.lowerType(unwrapped.result_ty.childType(zcu)); + const llvm_mask_ty = try o.builder.vectorType(.normal, @intCast(mask.len), .i32); + const llvm_poison_mask_elem = try o.builder.poisonConst(.i32); + + // This is kind of simpler than in `airShuffleOne`. We extend the shorter vector to the + // length of the longer one with an initial `shufflevector` if necessary, and then do the + // actual computation with a second `shufflevector`. + + const operand_a_len = fg.typeOf(unwrapped.operand_a).vectorLen(zcu); + const operand_b_len = fg.typeOf(unwrapped.operand_b).vectorLen(zcu); + const operand_len: u32 = @max(operand_a_len, operand_b_len); + + // If we need to extend an operand, this is the type that mask will have. 
+ const llvm_operand_mask_ty = try o.builder.vectorType(.normal, operand_len, .i32); + + const llvm_elem_buf = try gpa.alloc(Builder.Constant, @max(mask.len, operand_len)); + defer gpa.free(llvm_elem_buf); - const llvm_mask_value = try o.builder.vectorValue( - try o.builder.vectorType(.normal, mask_len, .i32), - values, + const operand_a: Builder.Value = extend: { + const raw = try fg.resolveInst(unwrapped.operand_a); + if (operand_a_len == operand_len) break :extend raw; + // Extend with a `shufflevector`, with a mask `<0, 1, ..., n, poison, poison, ..., poison>` + const mask_elems = llvm_elem_buf[0..operand_len]; + for (mask_elems[0..operand_a_len], 0..) |*llvm_elem, elem_idx| { + llvm_elem.* = try o.builder.intConst(.i32, elem_idx); + } + @memset(mask_elems[operand_a_len..], llvm_poison_mask_elem); + const llvm_this_operand_ty = try o.builder.vectorType(.normal, operand_a_len, llvm_elem_ty); + break :extend try fg.wip.shuffleVector( + raw, + try o.builder.poisonValue(llvm_this_operand_ty), + try o.builder.vectorValue(llvm_operand_mask_ty, mask_elems), + "", + ); + }; + const operand_b: Builder.Value = extend: { + const raw = try fg.resolveInst(unwrapped.operand_b); + if (operand_b_len == operand_len) break :extend raw; + // Extend with a `shufflevector`, with a mask `<0, 1, ..., n, poison, poison, ..., poison>` + const mask_elems = llvm_elem_buf[0..operand_len]; + for (mask_elems[0..operand_b_len], 0..) |*llvm_elem, elem_idx| { + llvm_elem.* = try o.builder.intConst(.i32, elem_idx); + } + @memset(mask_elems[operand_b_len..], llvm_poison_mask_elem); + const llvm_this_operand_ty = try o.builder.vectorType(.normal, operand_b_len, llvm_elem_ty); + break :extend try fg.wip.shuffleVector( + raw, + try o.builder.poisonValue(llvm_this_operand_ty), + try o.builder.vectorValue(llvm_operand_mask_ty, mask_elems), + "", + ); + }; + + // `operand_a` and `operand_b` now have the same length (we've extended the shorter one with + // an initial shuffle if necessary). 
Now for the easy bit. + + const mask_elems = llvm_elem_buf[0..mask.len]; + for (mask, mask_elems) |mask_elem, *llvm_mask_elem| { + llvm_mask_elem.* = switch (mask_elem.unwrap()) { + .a_elem => |idx| try o.builder.intConst(.i32, idx), + .b_elem => |idx| try o.builder.intConst(.i32, operand_len + idx), + .undef => llvm_poison_mask_elem, + }; + } + return fg.wip.shuffleVector( + operand_a, + operand_b, + try o.builder.vectorValue(llvm_mask_ty, mask_elems), + "", ); - return self.wip.shuffleVector(a, b, llvm_mask_value, ""); } /// Reduce a vector by repeatedly applying `llvm_fn` to produce an accumulated result. diff --git a/src/codegen/spirv.zig b/src/codegen/spirv.zig index 1381a79075..f83c6979ff 100644 --- a/src/codegen/spirv.zig +++ b/src/codegen/spirv.zig @@ -3252,7 +3252,8 @@ const NavGen = struct { .splat => try self.airSplat(inst), .reduce, .reduce_optimized => try self.airReduce(inst), - .shuffle => try self.airShuffle(inst), + .shuffle_one => try self.airShuffleOne(inst), + .shuffle_two => try self.airShuffleTwo(inst), .ptr_add => try self.airPtrAdd(inst), .ptr_sub => try self.airPtrSub(inst), @@ -4047,40 +4048,57 @@ const NavGen = struct { return result_id; } - fn airShuffle(self: *NavGen, inst: Air.Inst.Index) !?IdRef { - const pt = self.pt; + fn airShuffleOne(ng: *NavGen, inst: Air.Inst.Index) !?IdRef { + const pt = ng.pt; const zcu = pt.zcu; - const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; - const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data; - const a = try self.resolve(extra.a); - const b = try self.resolve(extra.b); - const mask = Value.fromInterned(extra.mask); + const gpa = zcu.gpa; - // Note: number of components in the result, a, and b may differ. 
- const result_ty = self.typeOfIndex(inst); - const scalar_ty = result_ty.scalarType(zcu); - const scalar_ty_id = try self.resolveType(scalar_ty, .direct); + const unwrapped = ng.air.unwrapShuffleOne(zcu, inst); + const mask = unwrapped.mask; + const result_ty = unwrapped.result_ty; + const elem_ty = result_ty.childType(zcu); + const operand = try ng.resolve(unwrapped.operand); - const constituents = try self.gpa.alloc(IdRef, result_ty.vectorLen(zcu)); - defer self.gpa.free(constituents); + const constituents = try gpa.alloc(IdRef, mask.len); + defer gpa.free(constituents); - for (constituents, 0..) |*id, i| { - const elem = try mask.elemValue(pt, i); - if (elem.isUndef(zcu)) { - id.* = try self.spv.constUndef(scalar_ty_id); - continue; - } + for (constituents, mask) |*id, mask_elem| { + id.* = switch (mask_elem.unwrap()) { + .elem => |idx| try ng.extractVectorComponent(elem_ty, operand, idx), + .value => |val| try ng.constant(elem_ty, .fromInterned(val), .direct), + }; + } - const index = elem.toSignedInt(zcu); - if (index >= 0) { - id.* = try self.extractVectorComponent(scalar_ty, a, @intCast(index)); - } else { - id.* = try self.extractVectorComponent(scalar_ty, b, @intCast(~index)); - } + const result_ty_id = try ng.resolveType(result_ty, .direct); + return try ng.constructComposite(result_ty_id, constituents); + } + + fn airShuffleTwo(ng: *NavGen, inst: Air.Inst.Index) !?IdRef { + const pt = ng.pt; + const zcu = pt.zcu; + const gpa = zcu.gpa; + + const unwrapped = ng.air.unwrapShuffleTwo(zcu, inst); + const mask = unwrapped.mask; + const result_ty = unwrapped.result_ty; + const elem_ty = result_ty.childType(zcu); + const elem_ty_id = try ng.resolveType(elem_ty, .direct); + const operand_a = try ng.resolve(unwrapped.operand_a); + const operand_b = try ng.resolve(unwrapped.operand_b); + + const constituents = try gpa.alloc(IdRef, mask.len); + defer gpa.free(constituents); + + for (constituents, mask) |*id, mask_elem| { + id.* = switch (mask_elem.unwrap()) { + 
.a_elem => |idx| try ng.extractVectorComponent(elem_ty, operand_a, idx), + .b_elem => |idx| try ng.extractVectorComponent(elem_ty, operand_b, idx), + .undef => try ng.spv.constUndef(elem_ty_id), + }; } - const result_ty_id = try self.resolveType(result_ty, .direct); - return try self.constructComposite(result_ty_id, constituents); + const result_ty_id = try ng.resolveType(result_ty, .direct); + return try ng.constructComposite(result_ty_id, constituents); } fn indicesToIds(self: *NavGen, indices: []const u32) ![]IdRef { diff --git a/src/print_air.zig b/src/print_air.zig index 0f658dcd9f..6085adbcdc 100644 --- a/src/print_air.zig +++ b/src/print_air.zig @@ -315,7 +315,8 @@ const Writer = struct { .wasm_memory_grow => try w.writeWasmMemoryGrow(s, inst), .mul_add => try w.writeMulAdd(s, inst), .select => try w.writeSelect(s, inst), - .shuffle => try w.writeShuffle(s, inst), + .shuffle_one => try w.writeShuffleOne(s, inst), + .shuffle_two => try w.writeShuffleTwo(s, inst), .reduce, .reduce_optimized => try w.writeReduce(s, inst), .cmp_vector, .cmp_vector_optimized => try w.writeCmpVector(s, inst), .vector_store_elem => try w.writeVectorStoreElem(s, inst), @@ -499,14 +500,39 @@ const Writer = struct { try w.writeOperand(s, inst, 2, pl_op.operand); } - fn writeShuffle(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void { - const ty_pl = w.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; - const extra = w.air.extraData(Air.Shuffle, ty_pl.payload).data; + fn writeShuffleOne(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void { + const unwrapped = w.air.unwrapShuffleOne(w.pt.zcu, inst); + try w.writeType(s, unwrapped.result_ty); + try s.writeAll(", "); + try w.writeOperand(s, inst, 0, unwrapped.operand); + try s.writeAll(", ["); + for (unwrapped.mask, 0..) 
|mask_elem, mask_idx| { + if (mask_idx > 0) try s.writeAll(", "); + switch (mask_elem.unwrap()) { + .elem => |idx| try s.print("elem {d}", .{idx}), + .value => |val| try s.print("val {}", .{Value.fromInterned(val).fmtValue(w.pt)}), + } + } + try s.writeByte(']'); + } - try w.writeOperand(s, inst, 0, extra.a); + fn writeShuffleTwo(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void { + const unwrapped = w.air.unwrapShuffleTwo(w.pt.zcu, inst); + try w.writeType(s, unwrapped.result_ty); + try s.writeAll(", "); + try w.writeOperand(s, inst, 0, unwrapped.operand_a); try s.writeAll(", "); - try w.writeOperand(s, inst, 1, extra.b); - try s.print(", mask {d}, len {d}", .{ extra.mask, extra.mask_len }); + try w.writeOperand(s, inst, 1, unwrapped.operand_b); + try s.writeAll(", ["); + for (unwrapped.mask, 0..) |mask_elem, mask_idx| { + if (mask_idx > 0) try s.writeAll(", "); + switch (mask_elem.unwrap()) { + .a_elem => |idx| try s.print("a_elem {d}", .{idx}), + .b_elem => |idx| try s.print("b_elem {d}", .{idx}), + .undef => try s.writeAll("undef"), + } + } + try s.writeByte(']'); } fn writeSelect(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void { diff --git a/test/cases/compile_errors/shuffle_with_selected_index_past_first_vector_length.zig b/test/cases/compile_errors/shuffle_with_selected_index_past_first_vector_length.zig index c1594d55fb..4ad01d28c4 100644 --- a/test/cases/compile_errors/shuffle_with_selected_index_past_first_vector_length.zig +++ b/test/cases/compile_errors/shuffle_with_selected_index_past_first_vector_length.zig @@ -1,14 +1,20 @@ -export fn entry() void { - const v: @Vector(4, u32) = [4]u32{ 10, 11, 12, 13 }; - const x: @Vector(4, u32) = [4]u32{ 14, 15, 16, 17 }; - const z = @shuffle(u32, v, x, [8]i32{ 0, 1, 2, 3, 7, 6, 5, 4 }); - _ = z; +export fn foo() void { + // Here, the bad index ('7') is not less than 'b.len', so the error shouldn't have a note suggesting a negative index. 
+ const a: @Vector(4, u32) = .{ 10, 11, 12, 13 }; + const b: @Vector(4, u32) = .{ 14, 15, 16, 17 }; + _ = @shuffle(u32, a, b, [8]i32{ 0, 1, 2, 3, 7, 6, 5, 4 }); +} +export fn bar() void { + // Here, the bad index ('7') *is* less than 'b.len', so the error *should* have a note suggesting a negative index. + const a: @Vector(4, u32) = .{ 10, 11, 12, 13 }; + const b: @Vector(9, u32) = .{ 14, 15, 16, 17, 18, 19, 20, 21, 22 }; + _ = @shuffle(u32, a, b, [8]i32{ 0, 1, 2, 3, 7, 6, 5, 4 }); } // error -// backend=stage2 -// target=native // -// :4:41: error: mask index '4' has out-of-bounds selection -// :4:29: note: selected index '7' out of bounds of '@Vector(4, u32)' -// :4:32: note: selections from the second vector are specified with negative numbers +// :5:35: error: mask element at index '4' selects out-of-bounds index +// :5:23: note: index '7' exceeds bounds of '@Vector(4, u32)' given here +// :11:35: error: mask element at index '4' selects out-of-bounds index +// :11:23: note: index '7' exceeds bounds of '@Vector(4, u32)' given here +// :11:26: note: use '~@as(u32, 7)' to index into second vector given here -- cgit v1.2.3 From c1a5caa4545264b476951e844818f2abe103f41c Mon Sep 17 00:00:00 2001 From: mlugg Date: Sun, 1 Jun 2025 07:41:24 +0100 Subject: compiler: combine `@intCast` safety checks `castTruncatedData` was a poorly worded error (all shrinking casts "truncate bits", it's just that we assume those bits to be zext/sext of the other bits!), and `negativeToUnsigned` was a pointless distinction which forced the compiler to emit worse code (since two separate safety checks were required for casting e.g. 'i32' to 'u16') and wasn't even implemented correctly. This commit combines those safety panics into one function, `integerOutOfBounds`. The name maybe isn't perfect, but that's not hugely important; what matters is the new default message, which is clearer than the old ones: "integer does not fit in destination type". 
--- doc/langref/test_intCast_builtin.zig | 2 +- lib/std/debug.zig | 12 ++++++------ lib/std/debug/no_panic.zig | 11 +++++------ lib/std/debug/simple_panic.zig | 12 ++++++------ src/Air/Legalize.zig | 2 +- src/Sema.zig | 16 ++++++---------- src/Zcu.zig | 12 ++++-------- src/codegen/llvm.zig | 10 +++------- test/cases/compile_errors/bad_panic_call_signature.zig | 5 +---- .../cases/compile_errors/bad_panic_generic_signature.zig | 5 +---- test/cases/safety/@intCast to u0.zig | 2 +- ...ot fitting in cast to unsigned integer - widening.zig | 2 +- ...d integer not fitting in cast to unsigned integer.zig | 2 +- test/cases/safety/signed-unsigned vector cast.zig | 2 +- test/cases/safety/truncating vector cast.zig | 2 +- ...itting in cast to signed integer - same bit count.zig | 2 +- test/cases/safety/unsigned-signed vector cast.zig | 2 +- .../value does not fit in shortening cast - u0.zig | 2 +- .../safety/value does not fit in shortening cast.zig | 2 +- test/incremental/change_panic_handler_explicit | 15 +++------------ 20 files changed, 46 insertions(+), 74 deletions(-) (limited to 'src/codegen') diff --git a/doc/langref/test_intCast_builtin.zig b/doc/langref/test_intCast_builtin.zig index cfd5b9c092..835ba48379 100644 --- a/doc/langref/test_intCast_builtin.zig +++ b/doc/langref/test_intCast_builtin.zig @@ -5,4 +5,4 @@ test "integer cast panic" { _ = b; } -// test_error=cast truncated bits +// test_error=integer does not fit in destination type diff --git a/lib/std/debug.zig b/lib/std/debug.zig index 527676566c..450c627e15 100644 --- a/lib/std/debug.zig +++ b/lib/std/debug.zig @@ -78,13 +78,9 @@ pub fn FullPanic(comptime panicFn: fn ([]const u8, ?usize) noreturn) type { @branchHint(.cold); call("invalid error code", @returnAddress()); } - pub fn castTruncatedData() noreturn { + pub fn integerOutOfBounds() noreturn { @branchHint(.cold); - call("integer cast truncated bits", @returnAddress()); - } - pub fn negativeToUnsigned() noreturn { - @branchHint(.cold); - call("attempt 
to cast negative value to unsigned integer", @returnAddress()); + call("integer does not fit in destination type", @returnAddress()); } pub fn integerOverflow() noreturn { @branchHint(.cold); @@ -128,6 +124,10 @@ pub fn FullPanic(comptime panicFn: fn ([]const u8, ?usize) noreturn) type { } /// Delete after next zig1.wasm update pub const memcpyLenMismatch = copyLenMismatch; + /// Delete after next zig1.wasm update + pub const castTruncatedData = integerOutOfBounds; + /// Delete after next zig1.wasm update + pub const negativeToUnsigned = integerOutOfBounds; pub fn copyLenMismatch() noreturn { @branchHint(.cold); call("source and destination arguments have non-equal lengths", @returnAddress()); diff --git a/lib/std/debug/no_panic.zig b/lib/std/debug/no_panic.zig index 0a4996097a..67181b116e 100644 --- a/lib/std/debug/no_panic.zig +++ b/lib/std/debug/no_panic.zig @@ -65,12 +65,7 @@ pub fn invalidErrorCode() noreturn { @trap(); } -pub fn castTruncatedData() noreturn { - @branchHint(.cold); - @trap(); -} - -pub fn negativeToUnsigned() noreturn { +pub fn integerOutOfBounds() noreturn { @branchHint(.cold); @trap(); } @@ -127,6 +122,10 @@ pub fn forLenMismatch() noreturn { /// Delete after next zig1.wasm update pub const memcpyLenMismatch = copyLenMismatch; +/// Delete after next zig1.wasm update +pub const castTruncatedData = integerOutOfBounds; +/// Delete after next zig1.wasm update +pub const negativeToUnsigned = integerOutOfBounds; pub fn copyLenMismatch() noreturn { @branchHint(.cold); diff --git a/lib/std/debug/simple_panic.zig b/lib/std/debug/simple_panic.zig index 568f7de495..61a3d5d76f 100644 --- a/lib/std/debug/simple_panic.zig +++ b/lib/std/debug/simple_panic.zig @@ -72,12 +72,8 @@ pub fn invalidErrorCode() noreturn { call("invalid error code", null); } -pub fn castTruncatedData() noreturn { - call("integer cast truncated bits", null); -} - -pub fn negativeToUnsigned() noreturn { - call("attempt to cast negative value to unsigned integer", null); +pub fn 
integerOutOfBounds() noreturn { + call("integer does not fit in destination type", null); } pub fn integerOverflow() noreturn { @@ -122,6 +118,10 @@ pub fn forLenMismatch() noreturn { /// Delete after next zig1.wasm update pub const memcpyLenMismatch = copyLenMismatch; +/// Delete after next zig1.wasm update +pub const castTruncatedData = integerOutOfBounds; +/// Delete after next zig1.wasm update +pub const negativeToUnsigned = integerOutOfBounds; pub fn copyLenMismatch() noreturn { call("source and destination have non-equal lengths", null); diff --git a/src/Air/Legalize.zig b/src/Air/Legalize.zig index 220ff54924..8f36ba21a7 100644 --- a/src/Air/Legalize.zig +++ b/src/Air/Legalize.zig @@ -1307,7 +1307,7 @@ fn safeIntcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.In var main_block: Block = .init(&inst_buf); var cur_block: *Block = &main_block; - const panic_id: Zcu.SimplePanicId = if (dest_is_enum) .invalid_enum_value else .cast_truncated_data; + const panic_id: Zcu.SimplePanicId = if (dest_is_enum) .invalid_enum_value else .integer_out_of_bounds; if (have_min_check or have_max_check) { const dest_int_ty = if (dest_is_enum) dest_ty.intTagType(zcu) else dest_ty; diff --git a/src/Sema.zig b/src/Sema.zig index 3c4fc555cb..f051a62af3 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -10263,7 +10263,7 @@ fn zirIntCast(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air const dest_ty = try sema.resolveDestType(block, src, extra.lhs, .remove_eu_opt, "@intCast"); const operand = try sema.resolveInst(extra.rhs); - return sema.intCast(block, block.nodeOffset(inst_data.src_node), dest_ty, src, operand, operand_src, true, false); + return sema.intCast(block, block.nodeOffset(inst_data.src_node), dest_ty, src, operand, operand_src); } fn intCast( @@ -10274,8 +10274,6 @@ fn intCast( dest_ty_src: LazySrcLoc, operand: Air.Inst.Ref, operand_src: LazySrcLoc, - runtime_safety: bool, - safety_panics_are_enum: bool, ) CompileError!Air.Inst.Ref { 
const pt = sema.pt; const zcu = pt.zcu; @@ -10294,7 +10292,7 @@ fn intCast( if ((try sema.typeHasOnePossibleValue(dest_ty))) |opv| { // requirement: intCast(u0, input) iff input == 0 - if (runtime_safety and block.wantSafety()) { + if (block.wantSafety()) { try sema.requireRuntimeBlock(block, src, operand_src); const wanted_info = dest_scalar_ty.intInfo(zcu); const wanted_bits = wanted_info.bits; @@ -10311,7 +10309,7 @@ fn intCast( const is_in_range = try block.addBinOp(.cmp_lte, operand, zero_inst); break :ok is_in_range; }; - try sema.addSafetyCheck(block, src, ok, if (safety_panics_are_enum) .invalid_enum_value else .cast_truncated_data); + try sema.addSafetyCheck(block, src, ok, .integer_out_of_bounds); } } @@ -10319,10 +10317,9 @@ fn intCast( } try sema.requireRuntimeBlock(block, src, operand_src); - if (runtime_safety and block.wantSafety()) { + if (block.wantSafety()) { if (zcu.backendSupportsFeature(.panic_fn)) { - _ = try sema.preparePanicId(src, .negative_to_unsigned); - _ = try sema.preparePanicId(src, .cast_truncated_data); + _ = try sema.preparePanicId(src, .integer_out_of_bounds); } return block.addTyOp(.intcast_safe, dest_ty, operand); } @@ -37984,8 +37981,7 @@ fn getExpectedBuiltinFnType(sema: *Sema, decl: Zcu.BuiltinDecl) CompileError!Typ .@"panic.castToNull", .@"panic.incorrectAlignment", .@"panic.invalidErrorCode", - .@"panic.castTruncatedData", - .@"panic.negativeToUnsigned", + .@"panic.integerOutOfBounds", .@"panic.integerOverflow", .@"panic.shlOverflow", .@"panic.shrOverflow", diff --git a/src/Zcu.zig b/src/Zcu.zig index 15e53aa202..20fafb6c4d 100644 --- a/src/Zcu.zig +++ b/src/Zcu.zig @@ -441,8 +441,7 @@ pub const BuiltinDecl = enum { @"panic.castToNull", @"panic.incorrectAlignment", @"panic.invalidErrorCode", - @"panic.castTruncatedData", - @"panic.negativeToUnsigned", + @"panic.integerOutOfBounds", @"panic.integerOverflow", @"panic.shlOverflow", @"panic.shrOverflow", @@ -518,8 +517,7 @@ pub const BuiltinDecl = enum { .@"panic.castToNull", 
.@"panic.incorrectAlignment", .@"panic.invalidErrorCode", - .@"panic.castTruncatedData", - .@"panic.negativeToUnsigned", + .@"panic.integerOutOfBounds", .@"panic.integerOverflow", .@"panic.shlOverflow", .@"panic.shrOverflow", @@ -585,8 +583,7 @@ pub const SimplePanicId = enum { cast_to_null, incorrect_alignment, invalid_error_code, - cast_truncated_data, - negative_to_unsigned, + integer_out_of_bounds, integer_overflow, shl_overflow, shr_overflow, @@ -609,8 +606,7 @@ pub const SimplePanicId = enum { .cast_to_null => .@"panic.castToNull", .incorrect_alignment => .@"panic.incorrectAlignment", .invalid_error_code => .@"panic.invalidErrorCode", - .cast_truncated_data => .@"panic.castTruncatedData", - .negative_to_unsigned => .@"panic.negativeToUnsigned", + .integer_out_of_bounds => .@"panic.integerOutOfBounds", .integer_overflow => .@"panic.integerOverflow", .shl_overflow => .@"panic.shlOverflow", .shr_overflow => .@"panic.shrOverflow", diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 960d5f819b..268a57417b 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -9189,11 +9189,7 @@ pub const FuncGen = struct { const is_vector = operand_ty.zigTypeTag(zcu) == .vector; assert(is_vector == (dest_ty.zigTypeTag(zcu) == .vector)); - const min_panic_id: Zcu.SimplePanicId, const max_panic_id: Zcu.SimplePanicId = id: { - if (dest_is_enum) break :id .{ .invalid_enum_value, .invalid_enum_value }; - if (dest_info.signedness == .unsigned) break :id .{ .negative_to_unsigned, .cast_truncated_data }; - break :id .{ .cast_truncated_data, .cast_truncated_data }; - }; + const panic_id: Zcu.SimplePanicId = if (dest_is_enum) .invalid_enum_value else .integer_out_of_bounds; if (have_min_check) { const min_const_scalar = try minIntConst(&o.builder, dest_scalar, operand_scalar_llvm_ty, zcu); @@ -9207,7 +9203,7 @@ pub const FuncGen = struct { const ok_block = try fg.wip.block(1, "IntMinOk"); _ = try fg.wip.brCond(ok, ok_block, fail_block, .none); fg.wip.cursor = .{ .block 
= fail_block }; - try fg.buildSimplePanic(min_panic_id); + try fg.buildSimplePanic(panic_id); fg.wip.cursor = .{ .block = ok_block }; } @@ -9223,7 +9219,7 @@ pub const FuncGen = struct { const ok_block = try fg.wip.block(1, "IntMaxOk"); _ = try fg.wip.brCond(ok, ok_block, fail_block, .none); fg.wip.cursor = .{ .block = fail_block }; - try fg.buildSimplePanic(max_panic_id); + try fg.buildSimplePanic(panic_id); fg.wip.cursor = .{ .block = ok_block }; } } diff --git a/test/cases/compile_errors/bad_panic_call_signature.zig b/test/cases/compile_errors/bad_panic_call_signature.zig index 1af0fdeb17..6d88f1b878 100644 --- a/test/cases/compile_errors/bad_panic_call_signature.zig +++ b/test/cases/compile_errors/bad_panic_call_signature.zig @@ -15,8 +15,7 @@ pub const panic = struct { pub const castToNull = simple_panic.castToNull; pub const incorrectAlignment = simple_panic.incorrectAlignment; pub const invalidErrorCode = simple_panic.invalidErrorCode; - pub const castTruncatedData = simple_panic.castTruncatedData; - pub const negativeToUnsigned = simple_panic.negativeToUnsigned; + pub const integerOutOfBounds = simple_panic.integerOutOfBounds; pub const integerOverflow = simple_panic.integerOverflow; pub const shlOverflow = simple_panic.shlOverflow; pub const shrOverflow = simple_panic.shrOverflow; @@ -27,8 +26,6 @@ pub const panic = struct { pub const shiftRhsTooBig = simple_panic.shiftRhsTooBig; pub const invalidEnumValue = simple_panic.invalidEnumValue; pub const forLenMismatch = simple_panic.forLenMismatch; - /// Delete after next zig1.wasm update - pub const memcpyLenMismatch = copyLenMismatch; pub const copyLenMismatch = simple_panic.copyLenMismatch; pub const memcpyAlias = simple_panic.memcpyAlias; pub const noreturnReturned = simple_panic.noreturnReturned; diff --git a/test/cases/compile_errors/bad_panic_generic_signature.zig b/test/cases/compile_errors/bad_panic_generic_signature.zig index 9373551359..8ef4810745 100644 --- 
a/test/cases/compile_errors/bad_panic_generic_signature.zig +++ b/test/cases/compile_errors/bad_panic_generic_signature.zig @@ -11,8 +11,7 @@ pub const panic = struct { pub const castToNull = simple_panic.castToNull; pub const incorrectAlignment = simple_panic.incorrectAlignment; pub const invalidErrorCode = simple_panic.invalidErrorCode; - pub const castTruncatedData = simple_panic.castTruncatedData; - pub const negativeToUnsigned = simple_panic.negativeToUnsigned; + pub const integerOutOfBounds = simple_panic.integerOutOfBounds; pub const integerOverflow = simple_panic.integerOverflow; pub const shlOverflow = simple_panic.shlOverflow; pub const shrOverflow = simple_panic.shrOverflow; @@ -23,8 +22,6 @@ pub const panic = struct { pub const shiftRhsTooBig = simple_panic.shiftRhsTooBig; pub const invalidEnumValue = simple_panic.invalidEnumValue; pub const forLenMismatch = simple_panic.forLenMismatch; - /// Delete after next zig1.wasm update - pub const memcpyLenMismatch = copyLenMismatch; pub const copyLenMismatch = simple_panic.copyLenMismatch; pub const memcpyAlias = simple_panic.memcpyAlias; pub const noreturnReturned = simple_panic.noreturnReturned; diff --git a/test/cases/safety/@intCast to u0.zig b/test/cases/safety/@intCast to u0.zig index 1637a859ad..4394f63f54 100644 --- a/test/cases/safety/@intCast to u0.zig +++ b/test/cases/safety/@intCast to u0.zig @@ -2,7 +2,7 @@ const std = @import("std"); pub fn panic(message: []const u8, stack_trace: ?*std.builtin.StackTrace, _: ?usize) noreturn { _ = stack_trace; - if (std.mem.eql(u8, message, "integer cast truncated bits")) { + if (std.mem.eql(u8, message, "integer does not fit in destination type")) { std.process.exit(0); } std.process.exit(1); diff --git a/test/cases/safety/signed integer not fitting in cast to unsigned integer - widening.zig b/test/cases/safety/signed integer not fitting in cast to unsigned integer - widening.zig index 3ee2f1fefa..fa0eec94c0 100644 --- a/test/cases/safety/signed integer not 
fitting in cast to unsigned integer - widening.zig +++ b/test/cases/safety/signed integer not fitting in cast to unsigned integer - widening.zig @@ -2,7 +2,7 @@ const std = @import("std"); pub fn panic(message: []const u8, stack_trace: ?*std.builtin.StackTrace, _: ?usize) noreturn { _ = stack_trace; - if (std.mem.eql(u8, message, "attempt to cast negative value to unsigned integer")) { + if (std.mem.eql(u8, message, "integer does not fit in destination type")) { std.process.exit(0); } std.process.exit(1); diff --git a/test/cases/safety/signed integer not fitting in cast to unsigned integer.zig b/test/cases/safety/signed integer not fitting in cast to unsigned integer.zig index 44402c329e..6ce662cdc7 100644 --- a/test/cases/safety/signed integer not fitting in cast to unsigned integer.zig +++ b/test/cases/safety/signed integer not fitting in cast to unsigned integer.zig @@ -2,7 +2,7 @@ const std = @import("std"); pub fn panic(message: []const u8, stack_trace: ?*std.builtin.StackTrace, _: ?usize) noreturn { _ = stack_trace; - if (std.mem.eql(u8, message, "attempt to cast negative value to unsigned integer")) { + if (std.mem.eql(u8, message, "integer does not fit in destination type")) { std.process.exit(0); } std.process.exit(1); diff --git a/test/cases/safety/signed-unsigned vector cast.zig b/test/cases/safety/signed-unsigned vector cast.zig index f4da258f28..919562b06c 100644 --- a/test/cases/safety/signed-unsigned vector cast.zig +++ b/test/cases/safety/signed-unsigned vector cast.zig @@ -2,7 +2,7 @@ const std = @import("std"); pub fn panic(message: []const u8, stack_trace: ?*std.builtin.StackTrace, _: ?usize) noreturn { _ = stack_trace; - if (std.mem.eql(u8, message, "attempt to cast negative value to unsigned integer")) { + if (std.mem.eql(u8, message, "integer does not fit in destination type")) { std.process.exit(0); } std.process.exit(1); diff --git a/test/cases/safety/truncating vector cast.zig b/test/cases/safety/truncating vector cast.zig index 
463d4a11bd..9b222e6918 100644 --- a/test/cases/safety/truncating vector cast.zig +++ b/test/cases/safety/truncating vector cast.zig @@ -2,7 +2,7 @@ const std = @import("std"); pub fn panic(message: []const u8, stack_trace: ?*std.builtin.StackTrace, _: ?usize) noreturn { _ = stack_trace; - if (std.mem.eql(u8, message, "integer cast truncated bits")) { + if (std.mem.eql(u8, message, "integer does not fit in destination type")) { std.process.exit(0); } std.process.exit(1); diff --git a/test/cases/safety/unsigned integer not fitting in cast to signed integer - same bit count.zig b/test/cases/safety/unsigned integer not fitting in cast to signed integer - same bit count.zig index 6a3f0c08a6..185cde9973 100644 --- a/test/cases/safety/unsigned integer not fitting in cast to signed integer - same bit count.zig +++ b/test/cases/safety/unsigned integer not fitting in cast to signed integer - same bit count.zig @@ -2,7 +2,7 @@ const std = @import("std"); pub fn panic(message: []const u8, stack_trace: ?*std.builtin.StackTrace, _: ?usize) noreturn { _ = stack_trace; - if (std.mem.eql(u8, message, "integer cast truncated bits")) { + if (std.mem.eql(u8, message, "integer does not fit in destination type")) { std.process.exit(0); } std.process.exit(1); diff --git a/test/cases/safety/unsigned-signed vector cast.zig b/test/cases/safety/unsigned-signed vector cast.zig index 7465a4f9cd..6501643b36 100644 --- a/test/cases/safety/unsigned-signed vector cast.zig +++ b/test/cases/safety/unsigned-signed vector cast.zig @@ -2,7 +2,7 @@ const std = @import("std"); pub fn panic(message: []const u8, stack_trace: ?*std.builtin.StackTrace, _: ?usize) noreturn { _ = stack_trace; - if (std.mem.eql(u8, message, "integer cast truncated bits")) { + if (std.mem.eql(u8, message, "integer does not fit in destination type")) { std.process.exit(0); } std.process.exit(1); diff --git a/test/cases/safety/value does not fit in shortening cast - u0.zig b/test/cases/safety/value does not fit in shortening cast 
- u0.zig index 3644437ea1..f29df8d8af 100644 --- a/test/cases/safety/value does not fit in shortening cast - u0.zig +++ b/test/cases/safety/value does not fit in shortening cast - u0.zig @@ -2,7 +2,7 @@ const std = @import("std"); pub fn panic(message: []const u8, stack_trace: ?*std.builtin.StackTrace, _: ?usize) noreturn { _ = stack_trace; - if (std.mem.eql(u8, message, "integer cast truncated bits")) { + if (std.mem.eql(u8, message, "integer does not fit in destination type")) { std.process.exit(0); } std.process.exit(1); diff --git a/test/cases/safety/value does not fit in shortening cast.zig b/test/cases/safety/value does not fit in shortening cast.zig index b48c4698fa..415ac95dbb 100644 --- a/test/cases/safety/value does not fit in shortening cast.zig +++ b/test/cases/safety/value does not fit in shortening cast.zig @@ -2,7 +2,7 @@ const std = @import("std"); pub fn panic(message: []const u8, stack_trace: ?*std.builtin.StackTrace, _: ?usize) noreturn { _ = stack_trace; - if (std.mem.eql(u8, message, "integer cast truncated bits")) { + if (std.mem.eql(u8, message, "integer does not fit in destination type")) { std.process.exit(0); } std.process.exit(1); diff --git a/test/incremental/change_panic_handler_explicit b/test/incremental/change_panic_handler_explicit index 322773fd47..ad5d3d124a 100644 --- a/test/incremental/change_panic_handler_explicit +++ b/test/incremental/change_panic_handler_explicit @@ -26,8 +26,7 @@ pub const panic = struct { pub const castToNull = no_panic.castToNull; pub const incorrectAlignment = no_panic.incorrectAlignment; pub const invalidErrorCode = no_panic.invalidErrorCode; - pub const castTruncatedData = no_panic.castTruncatedData; - pub const negativeToUnsigned = no_panic.negativeToUnsigned; + pub const integerOutOfBounds = no_panic.integerOutOfBounds; pub const shlOverflow = no_panic.shlOverflow; pub const shrOverflow = no_panic.shrOverflow; pub const divideByZero = no_panic.divideByZero; @@ -37,8 +36,6 @@ pub const panic = struct 
{ pub const shiftRhsTooBig = no_panic.shiftRhsTooBig; pub const invalidEnumValue = no_panic.invalidEnumValue; pub const forLenMismatch = no_panic.forLenMismatch; - /// Delete after next zig1.wasm update - pub const memcpyLenMismatch = copyLenMismatch; pub const copyLenMismatch = no_panic.copyLenMismatch; pub const memcpyAlias = no_panic.memcpyAlias; pub const noreturnReturned = no_panic.noreturnReturned; @@ -75,8 +72,7 @@ pub const panic = struct { pub const castToNull = no_panic.castToNull; pub const incorrectAlignment = no_panic.incorrectAlignment; pub const invalidErrorCode = no_panic.invalidErrorCode; - pub const castTruncatedData = no_panic.castTruncatedData; - pub const negativeToUnsigned = no_panic.negativeToUnsigned; + pub const integerOutOfBounds = no_panic.integerOutOfBounds; pub const shlOverflow = no_panic.shlOverflow; pub const shrOverflow = no_panic.shrOverflow; pub const divideByZero = no_panic.divideByZero; @@ -86,8 +82,6 @@ pub const panic = struct { pub const shiftRhsTooBig = no_panic.shiftRhsTooBig; pub const invalidEnumValue = no_panic.invalidEnumValue; pub const forLenMismatch = no_panic.forLenMismatch; - /// Delete after next zig1.wasm update - pub const memcpyLenMismatch = copyLenMismatch; pub const copyLenMismatch = no_panic.copyLenMismatch; pub const memcpyAlias = no_panic.memcpyAlias; pub const noreturnReturned = no_panic.noreturnReturned; @@ -124,8 +118,7 @@ pub const panic = struct { pub const castToNull = no_panic.castToNull; pub const incorrectAlignment = no_panic.incorrectAlignment; pub const invalidErrorCode = no_panic.invalidErrorCode; - pub const castTruncatedData = no_panic.castTruncatedData; - pub const negativeToUnsigned = no_panic.negativeToUnsigned; + pub const integerOutOfBounds = no_panic.integerOutOfBounds; pub const shlOverflow = no_panic.shlOverflow; pub const shrOverflow = no_panic.shrOverflow; pub const divideByZero = no_panic.divideByZero; @@ -135,8 +128,6 @@ pub const panic = struct { pub const shiftRhsTooBig = 
no_panic.shiftRhsTooBig; pub const invalidEnumValue = no_panic.invalidEnumValue; pub const forLenMismatch = no_panic.forLenMismatch; - /// Delete after next zig1.wasm update - pub const memcpyLenMismatch = copyLenMismatch; pub const copyLenMismatch = no_panic.copyLenMismatch; pub const memcpyAlias = no_panic.memcpyAlias; pub const noreturnReturned = no_panic.noreturnReturned; -- cgit v1.2.3