From 36df1526da0e703a9f3d5bd6c8775d3f0e0f0a33 Mon Sep 17 00:00:00 2001
From: gracefu <81774659+gracefuu@users.noreply.github.com>
Date: Fri, 9 Apr 2021 13:43:42 +0800
Subject: stage2 x86_64: refactor codegen to use inst encoder

There are parts of it that I didn't modify because the byte
representation was important (e.g. we need to know the exact byte
position where we store the address into the offset table).
---
 src/codegen.zig | 490 ++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 332 insertions(+), 158 deletions(-)

(limited to 'src/codegen.zig')

diff --git a/src/codegen.zig b/src/codegen.zig
index a345fd8058..783c152595 100644
--- a/src/codegen.zig
+++ b/src/codegen.zig
@@ -1034,7 +1034,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                     },
                     .val = Value.initTag(.bool_true),
                 };
-                return try self.genX8664BinMath(&inst.base, inst.operand, &imm.base, 6, 0x30);
+                return try self.genX8664BinMath(&inst.base, inst.operand, &imm.base);
             },
             .arm, .armeb => {
                 var imm = ir.Inst.Constant{
@@ -1058,7 +1058,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                 return MCValue.dead;
             switch (arch) {
                 .x86_64 => {
-                    return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs, 0, 0x00);
+                    return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs);
                 },
                 .arm, .armeb => return try self.genArmBinOp(&inst.base, inst.lhs, inst.rhs, .add),
                 else => return self.fail(inst.base.src, "TODO implement add for {}", .{self.target.cpu.arch}),
@@ -1352,7 +1352,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                 return MCValue.dead;
             switch (arch) {
                 .x86_64 => {
-                    return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs, 5, 0x28);
+                    return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs);
                 },
                 .arm, .armeb => return try self.genArmBinOp(&inst.base, inst.lhs, inst.rhs, .sub),
                 else => return self.fail(inst.base.src, "TODO implement sub for {}", .{self.target.cpu.arch}),
@@ -1497,8 +1497,14 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
             return dst_mcv;
         }
 
+        /// Perform "binary" operators, excluding comparisons.
+        /// Currently, the following ops are supported:
         /// ADD, SUB, XOR, OR, AND
-        fn genX8664BinMath(self: *Self, inst: *ir.Inst, op_lhs: *ir.Inst, op_rhs: *ir.Inst, opx: u8, mr: u8) !MCValue {
+        fn genX8664BinMath(self: *Self, inst: *ir.Inst, op_lhs: *ir.Inst, op_rhs: *ir.Inst) !MCValue {
+            // We'll handle these ops in two steps.
+ // 1) Prepare an output register, and put one of the arguments in it + // 2) Perform the op with the other argument + try self.code.ensureCapacity(self.code.items.len + 8); const lhs = try self.resolveInst(op_lhs); @@ -1559,18 +1565,108 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { else => {}, } - try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, opx, mr); + // Now for step 2, we perform the actual op + switch (inst.tag) { + // TODO: Generate wrapping and non-wrapping versions separately + .add, .addwrap => try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, 0, 0x00), + .bool_or, .bit_or => try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, 1, 0x08), + .bool_and, .bit_and => try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, 4, 0x20), + .sub, .subwrap => try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, 5, 0x28), + .xor, .not => try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, 6, 0x30), + + else => unreachable, + } return dst_mcv; } + /// Wrap over Instruction.encodeInto to translate errors + fn encodeX8664Instruction( + self: *Self, + src: LazySrcLoc, + inst: Instruction, + ) !void { + inst.encodeInto(self.code) catch |err| { + if (err == error.OutOfMemory) + return error.OutOfMemory + else + return self.fail(src, "Instruction.encodeInto failed because {s}", .{@errorName(err)}); + }; + } + + /// This function encodes a binary operation for x86_64 + /// intended for use with the following opcode ranges + /// because they share the same structure. + /// + /// Thus not all binary operations can be used here + /// -- multiplication needs to be done with imul, + /// which doesn't have as convenient an interface. + /// + /// "opx"-style instructions use the opcode extension field to indicate which instruction to execute: + /// + /// opx = /0: add + /// opx = /1: or + /// opx = /2: adc + /// opx = /3: sbb + /// opx = /4: and + /// opx = /5: sub + /// opx = /6: xor + /// opx = /7: cmp + /// + /// opcode | operand shape + /// --------+---------------------- + /// 80 /opx | r/m8, imm8 + /// 81 /opx | r/m16/32/64, imm16/32 + /// 83 /opx | r/m16/32/64, imm8 + /// + /// "mr"-style instructions use the low bits of opcode to indicate shape of instruction: + /// + /// mr = 00: add + /// mr = 08: or + /// mr = 10: adc + /// mr = 18: sbb + /// mr = 20: and + /// mr = 28: sub + /// mr = 30: xor + /// mr = 38: cmp + /// + /// opcode | operand shape + /// -------+------------------------- + /// mr + 0 | r/m8, r8 + /// mr + 1 | r/m16/32/64, r16/32/64 + /// mr + 2 | r8, r/m8 + /// mr + 3 | r16/32/64, r/m16/32/64 + /// mr + 4 | AL, imm8 + /// mr + 5 | rAX, imm16/32 + /// + /// TODO: rotates and shifts share the same structure, so we can potentially implement them + /// at a later date with very similar code. + /// They have "opx"-style instructions, but no "mr"-style instructions. 
+ /// + /// opx = /0: rol, + /// opx = /1: ror, + /// opx = /2: rcl, + /// opx = /3: rcr, + /// opx = /4: shl sal, + /// opx = /5: shr, + /// opx = /6: sal shl, + /// opx = /7: sar, + /// + /// opcode | operand shape + /// --------+------------------ + /// c0 /opx | r/m8, imm8 + /// c1 /opx | r/m16/32/64, imm8 + /// d0 /opx | r/m8, 1 + /// d1 /opx | r/m16/32/64, 1 + /// d2 /opx | r/m8, CL (for context, CL is register 1) + /// d3 /opx | r/m16/32/64, CL (for context, CL is register 1) fn genX8664BinMathCode( self: *Self, src: LazySrcLoc, dst_ty: Type, dst_mcv: MCValue, src_mcv: MCValue, - opx: u8, + opx: u3, mr: u8, ) !void { switch (dst_mcv) { @@ -1589,31 +1685,78 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { .ptr_stack_offset => unreachable, .ptr_embedded_in_code => unreachable, .register => |src_reg| { - self.rex(.{ .b = dst_reg.isExtended(), .r = src_reg.isExtended(), .w = dst_reg.size() == 64 }); - self.code.appendSliceAssumeCapacity(&[_]u8{ mr + 0x1, 0xC0 | (@as(u8, src_reg.id() & 0b111) << 3) | @as(u8, dst_reg.id() & 0b111) }); + // register, register use mr + 1 addressing mode: r/m16/32/64, r16/32/64 + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = dst_reg.size() == 64, + .primary_opcode_1b = mr + 1, + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .reg = dst_reg }, + ), + .reg = src_reg, + }); }, .immediate => |imm| { + // register, immediate use opx = 81 or 83 addressing modes: + // opx = 81: r/m16/32/64, imm16/32 + // opx = 83: r/m16/32/64, imm8 const imm32 = @intCast(u31, imm); // This case must be handled before calling genX8664BinMathCode. - // 81 /opx id if (imm32 <= math.maxInt(u7)) { - self.rex(.{ .b = dst_reg.isExtended(), .w = dst_reg.size() == 64 }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0x83, - 0xC0 | (opx << 3) | @truncate(u3, dst_reg.id()), - @intCast(u8, imm32), + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = dst_reg.size() == 64, + .primary_opcode_1b = 0x83, + .opcode_extension = opx, + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .reg = dst_reg }, + ), + .immediate_bytes = 1, + .immediate = imm32, }); } else { - self.rex(.{ .r = dst_reg.isExtended(), .w = dst_reg.size() == 64 }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0x81, - 0xC0 | (opx << 3) | @truncate(u3, dst_reg.id()), + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = dst_reg.size() == 64, + .primary_opcode_1b = 0x81, + .opcode_extension = opx, + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .reg = dst_reg }, + ), + .immediate_bytes = 4, + .immediate = imm32, }); - std.mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), imm32); } }, - .embedded_in_code, .memory, .stack_offset => { + .embedded_in_code, .memory => { return self.fail(src, "TODO implement x86 ADD/SUB/CMP source memory", .{}); }, + .stack_offset => |off| { + const abi_size = dst_ty.abiSize(self.target.*); + const adj_off = off + abi_size; + if (off > math.maxInt(i32)) { + return self.fail(src, "stack offset too large", .{}); + } + try self.encodeX8664Instruction(src, 
Instruction{ + .operand_size_64 = dst_reg.size() == 64, + .primary_opcode_1b = mr + 0x3, + .reg = dst_reg, + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .mem_disp = .{ + .reg = Register.ebp, + .disp = -@intCast(i32, adj_off), + } }, + ), + }); + }, .compare_flags_unsigned => { return self.fail(src, "TODO implement x86 ADD/SUB/CMP source compare flag (unsigned)", .{}); }, @@ -1655,25 +1798,23 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { fn genX8664ModRMRegToStack(self: *Self, src: LazySrcLoc, ty: Type, off: u32, reg: Register, opcode: u8) !void { const abi_size = ty.abiSize(self.target.*); const adj_off = off + abi_size; - try self.code.ensureCapacity(self.code.items.len + 7); - self.rex(.{ .w = reg.size() == 64, .r = reg.isExtended() }); - const reg_id: u8 = @truncate(u3, reg.id()); - if (adj_off <= 128) { - // example: 48 89 55 7f mov QWORD PTR [rbp+0x7f],rdx - const RM = @as(u8, 0b01_000_101) | (reg_id << 3); - const negative_offset = @intCast(i8, -@intCast(i32, adj_off)); - const twos_comp = @bitCast(u8, negative_offset); - self.code.appendSliceAssumeCapacity(&[_]u8{ opcode, RM, twos_comp }); - } else if (adj_off <= 2147483648) { - // example: 48 89 95 80 00 00 00 mov QWORD PTR [rbp+0x80],rdx - const RM = @as(u8, 0b10_000_101) | (reg_id << 3); - const negative_offset = @intCast(i32, -@intCast(i33, adj_off)); - const twos_comp = @bitCast(u32, negative_offset); - self.code.appendSliceAssumeCapacity(&[_]u8{ opcode, RM }); - mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), twos_comp); - } else { + if (off > math.maxInt(i32)) { return self.fail(src, "stack offset too large", .{}); } + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = reg.size() == 64, + .primary_opcode_1b = opcode, + .reg = reg, + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .mem_disp = .{ + .reg = Register.ebp, + .disp = -@intCast(i32, adj_off), + } }, + ), + }); } fn genArgDbgInfo(self: *Self, inst: *ir.Inst.Arg, mcv: MCValue) !void { @@ -2340,15 +2481,24 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { }, .register => |reg| blk: { // test reg, 1 - // TODO detect al, ax, eax - try self.code.ensureCapacity(self.code.items.len + 4); - // TODO audit this codegen: we force w = true here to make - // the value affect the big register - self.rex(.{ .b = reg.isExtended(), .w = true }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0xf6, - @as(u8, 0xC0) | (0 << 3) | @truncate(u3, reg.id()), - 0x01, + try self.encodeX8664Instruction(inst.base.src, Instruction{ + // TODO audit this codegen: we force w = true here to make + // the value affect the big register + .operand_size_64 = true, + + .primary_opcode_1b = 0xf6, // f6/0 is TEST r/m8, imm8 + .opcode_extension = 0, + + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + // TODO detect al, ax, eax, there's another opcode 0xa8 for that + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .reg = reg }, + ), + + .immediate_bytes = 1, + .immediate = 1, }); break :blk 0x84; }, @@ -2662,9 +2812,9 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { switch (arch) { .x86_64 => switch (inst.base.tag) { // lhs 
AND rhs - .bool_and => return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs, 4, 0x20), + .bool_and => return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs), // lhs OR rhs - .bool_or => return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs, 1, 0x08), + .bool_or => return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs), else => unreachable, // Not a boolean operation }, .arm, .armeb => switch (inst.base.tag) { @@ -3451,20 +3601,27 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } }, .compare_flags_unsigned => |op| { - try self.code.ensureCapacity(self.code.items.len + 3); - // TODO audit this codegen: we force w = true here to make - // the value affect the big register - self.rex(.{ .b = reg.isExtended(), .w = true }); - const opcode: u8 = switch (op) { - .gte => 0x93, - .gt => 0x97, - .neq => 0x95, - .lt => 0x92, - .lte => 0x96, - .eq => 0x94, - }; - const id = @as(u8, reg.id() & 0b111); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x0f, opcode, 0xC0 | id }); + try self.encodeX8664Instruction(src, Instruction{ + // TODO audit this codegen: we force w = true here to make + // the value affect the big register + .operand_size_64 = true, + + .primary_opcode_2b = switch (op) { + .gte => 0x93, + .gt => 0x97, + .neq => 0x95, + .lt => 0x92, + .lte => 0x96, + .eq => 0x94, + }, + + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .reg = reg }, + ), + }); }, .compare_flags_signed => |op| { return self.fail(src, "TODO set register with compare flags value (signed)", .{}); @@ -3476,38 +3633,32 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // The encoding for `xor r32, r32` is `0x31 /r`. // Section 3.1.1.1 of the Intel x64 Manual states that "/r indicates that the // ModR/M byte of the instruction contains a register operand and an r/m operand." - // - // R/M bytes are composed of two bits for the mode, then three bits for the register, - // then three bits for the operand. Since we're zeroing a register, the two three-bit - // values will be identical, and the mode is three (the raw register value). - // - // If we're accessing e.g. r8d, we need to use a REX prefix before the actual operation. Since - // this is a 32-bit operation, the W flag is set to zero. X is also zero, as we're not using a SIB. - // Both R and B are set, as we're extending, in effect, the register bits *and* the operand. - try self.code.ensureCapacity(self.code.items.len + 3); - self.rex(.{ .r = reg.isExtended(), .b = reg.isExtended() }); - const id = @as(u8, reg.id() & 0b111); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x31, 0xC0 | id << 3 | id }); + try self.encodeX8664Instruction(src, Instruction{ + .primary_opcode_1b = 0x31, + + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .reg = @as(?Register, reg), + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .reg = reg }, + ), + }); return; } if (x <= math.maxInt(u32)) { // Next best case: if we set the lower four bytes, the upper four will be zeroed. // // The encoding for `mov IMM32 -> REG` is (0xB8 + R) IMM. - if (reg.isExtended()) { - // Just as with XORing, we need a REX prefix. This time though, we only - // need the B bit set, as we're extending the opcode's register field, - // and there is no Mod R/M byte. - // - // Thus, we need b01000001, or 0x41. 
- try self.code.resize(self.code.items.len + 6); - self.code.items[self.code.items.len - 6] = 0x41; - } else { - try self.code.resize(self.code.items.len + 5); - } - self.code.items[self.code.items.len - 5] = 0xB8 | @as(u8, reg.id() & 0b111); - const imm_ptr = self.code.items[self.code.items.len - 4 ..][0..4]; - mem.writeIntLittle(u32, imm_ptr, @intCast(u32, x)); + try self.encodeX8664Instruction(src, Instruction{ + // B8 + R + .primary_opcode_1b = 0xB8, + .opcode_reg = @as(?Register, reg), + + // IMM32 + .immediate_bytes = 4, + .immediate = x, + }); return; } // Worst case: we need to load the 64-bit register with the IMM. GNU's assemblers calls @@ -3517,50 +3668,58 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // This encoding is, in fact, the *same* as the one used for 32-bit loads. The only // difference is that we set REX.W before the instruction, which extends the load to // 64-bit and uses the full bit-width of the register. - // - // Since we always need a REX here, let's just check if we also need to set REX.B. - // - // In this case, the encoding of the REX byte is 0b0100100B - try self.code.ensureCapacity(self.code.items.len + 10); - self.rex(.{ .w = reg.size() == 64, .b = reg.isExtended() }); - self.code.items.len += 9; - self.code.items[self.code.items.len - 9] = 0xB8 | @as(u8, reg.id() & 0b111); - const imm_ptr = self.code.items[self.code.items.len - 8 ..][0..8]; - mem.writeIntLittle(u64, imm_ptr, x); + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = true, + // B8 + R + .primary_opcode_1b = 0xB8, + .opcode_reg = @as(?Register, reg), + + // IMM64 + .immediate_bytes = 8, + .immediate = x, + }); }, .embedded_in_code => |code_offset| { - // We need the offset from RIP in a signed i32 twos complement. - // The instruction is 7 bytes long and RIP points to the next instruction. - try self.code.ensureCapacity(self.code.items.len + 7); - // 64-bit LEA is encoded as REX.W 8D /r. If the register is extended, the REX byte is modified, - // but the operation size is unchanged. Since we're using a disp32, we want mode 0 and lower three - // bits as five. - // REX 0x8D 0b00RRR101, where RRR is the lower three bits of the id. - self.rex(.{ .w = reg.size() == 64, .b = reg.isExtended() }); - self.code.items.len += 6; + // 64-bit LEA is encoded as REX.W 8D /r. const rip = self.code.items.len; const big_offset = @intCast(i64, code_offset) - @intCast(i64, rip); const offset = @intCast(i32, big_offset); - self.code.items[self.code.items.len - 6] = 0x8D; - self.code.items[self.code.items.len - 5] = 0b101 | (@as(u8, reg.id() & 0b111) << 3); - const imm_ptr = self.code.items[self.code.items.len - 4 ..][0..4]; - mem.writeIntLittle(i32, imm_ptr, offset); + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = true, + + // LEA + .primary_opcode_1b = 0x8D, + + .reg = reg, + + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .disp32 = @bitCast(i32, offset) }, + ), + }); }, .register => |src_reg| { // If the registers are the same, nothing to do. if (src_reg.id() == reg.id()) return; - // This is a variant of 8B /r. Since we're using 64-bit moves, we require a REX. - // This is thus three bytes: REX 0x8B R/M. - // If the destination is extended, the R field must be 1. - // If the *source* is extended, the B field must be 1. - // Since the register is being accessed directly, the R/M mode is three. 
The reg field (the middle - // three bits) contain the destination, and the R/M field (the lower three bits) contain the source. - try self.code.ensureCapacity(self.code.items.len + 3); - self.rex(.{ .w = reg.size() == 64, .r = reg.isExtended(), .b = src_reg.isExtended() }); - const R = 0xC0 | (@as(u8, reg.id() & 0b111) << 3) | @as(u8, src_reg.id() & 0b111); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8B, R }); + // This is a variant of 8B /r. + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = reg.size() == 64, + + .primary_opcode_1b = 0x8B, + + .reg = reg, + + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .reg = src_reg }, + ), + }); }, .memory => |x| { if (self.bin_file.options.pie) { @@ -3577,6 +3736,9 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } else { return self.fail(src, "TODO implement genSetReg for PIE GOT indirection on this platform", .{}); } + + // LEA reg, [] + // manually do this instruction to make sure the offset into the disp32 field won't change. try self.code.ensureCapacity(self.code.items.len + 7); self.rex(.{ .w = reg.size() == 64, .r = reg.isExtended() }); self.code.appendSliceAssumeCapacity(&[_]u8{ @@ -3585,10 +3747,21 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { }); mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), 0); - try self.code.ensureCapacity(self.code.items.len + 3); - self.rex(.{ .w = reg.size() == 64, .b = reg.isExtended(), .r = reg.isExtended() }); - const RM = (@as(u8, reg.id() & 0b111) << 3) | @truncate(u3, reg.id()); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8B, RM }); + // MOV reg, [reg] + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = reg.size() == 64, + + .primary_opcode_1b = 0x8B, + + .reg = reg, + + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .mem = reg }, + ), + }); } else if (x <= math.maxInt(u32)) { // Moving from memory to a register is a variant of `8B /r`. // Since we're using 64-bit moves, we require a REX. @@ -3612,12 +3785,13 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // REX.W 0xA1 moffs64* // moffs64* is a 64-bit offset "relative to segment base", which really just means the // absolute address for all practical purposes. - try self.code.resize(self.code.items.len + 10); - // REX.W == 0x48 - self.code.items[self.code.items.len - 10] = 0x48; - self.code.items[self.code.items.len - 9] = 0xA1; - const imm_ptr = self.code.items[self.code.items.len - 8 ..][0..8]; - mem.writeIntLittle(u64, imm_ptr, x); + + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = true, + .primary_opcode_1b = 0xa1, + .immediate_bytes = 8, + .immediate = x, + }); } else { // This requires two instructions; a move imm as used above, followed by an indirect load using the register // as the address and the register as the destination. @@ -3634,41 +3808,41 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // Now, the register contains the address of the value to load into it // Currently, we're only allowing 64-bit registers, so we need the `REX.W 8B /r` variant. // TODO: determine whether to allow other sized registers, and if so, handle them properly. 
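                    // Worked example (illustration only): `mov rax, [rax]` encodes as
                    // 48 8b 00 -- REX.W, opcode 8B, then ModRM 0x00 (mod=00 register
                    // indirect, reg=rax, r/m=rax).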
- // This operation requires three bytes: REX 0x8B R/M - try self.code.ensureCapacity(self.code.items.len + 3); - // For this operation, we want R/M mode *zero* (use register indirectly), and the two register - // values must match. Thus, it's 00ABCABC where ABC is the lower three bits of the register ID. - // - // Furthermore, if this is an extended register, both B and R must be set in the REX byte, as *both* - // register operands need to be marked as extended. - self.rex(.{ .w = reg.size() == 64, .b = reg.isExtended(), .r = reg.isExtended() }); - const RM = (@as(u8, reg.id() & 0b111) << 3) | @truncate(u3, reg.id()); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8B, RM }); + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = reg.size() == 64, + .primary_opcode_1b = 0x8B, + .reg = reg, + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .mem = reg }, + ), + }); } } }, .stack_offset => |unadjusted_off| { - try self.code.ensureCapacity(self.code.items.len + 7); const size_bytes = @divExact(reg.size(), 8); const off = unadjusted_off + size_bytes; - self.rex(.{ .w = reg.size() == 64, .r = reg.isExtended() }); - const reg_id: u8 = @truncate(u3, reg.id()); - if (off <= 128) { - // Example: 48 8b 4d 7f mov rcx,QWORD PTR [rbp+0x7f] - const RM = @as(u8, 0b01_000_101) | (reg_id << 3); - const negative_offset = @intCast(i8, -@intCast(i32, off)); - const twos_comp = @bitCast(u8, negative_offset); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8b, RM, twos_comp }); - } else if (off <= 2147483648) { - // Example: 48 8b 8d 80 00 00 00 mov rcx,QWORD PTR [rbp+0x80] - const RM = @as(u8, 0b10_000_101) | (reg_id << 3); - const negative_offset = @intCast(i32, -@intCast(i33, off)); - const twos_comp = @bitCast(u32, negative_offset); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8b, RM }); - mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), twos_comp); - } else { + if (off < std.math.minInt(i32) or off > std.math.maxInt(i32)) { return self.fail(src, "stack offset too large", .{}); } + const ioff = -@intCast(i32, off); + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = reg.size() == 64, + .primary_opcode_1b = 0x8B, + .reg = reg, + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .mem_disp = .{ + .reg = Register.ebp, + .disp = ioff, + } }, + ), + }); }, }, else => return self.fail(src, "TODO implement getSetReg for {}", .{self.target.cpu.arch}), -- cgit v1.2.3 From 5bd464e386df35bfe38b062190074ce3c2689001 Mon Sep 17 00:00:00 2001 From: gracefu <81774659+gracefuu@users.noreply.github.com> Date: Fri, 9 Apr 2021 14:05:53 +0800 Subject: stage2 x86_64: use abi size to determine 64-bit operation From my very cursory reading, it seems that the register manager doesn't distinguish between registers that are physically the same but have different sizes. In that case, this means that during codegen, we can't rely on `reg.size()` when determining the width of the operations we have to perform. Instead, we must use some form of `ty.abiSize(self.target.*)` to determine the size of the type we're operating with. If this size is 64 bits, then we should enable 64-bit operation. 
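Schematically, the change at each call site is the following (a sketch,
not new code in this commit; `reg`/`ty` stand for whichever register and
type are in scope at the site):

    // before: operation width taken from the register's nominal size
    .operand_size_64 = reg.size() == 64,
    // after: operation width taken from the ABI size of the value's type
    .operand_size_64 = ty.abiSize(self.target.*) == 64,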
This fixed a bug in the codegen for spilling instructions, which was overwriting the previous stack entry with zeroes. See the modified test case in this commit. --- src/codegen.zig | 26 +++++++++++++------------- test/stage2/test.zig | 28 +++++++++++++++------------- 2 files changed, 28 insertions(+), 26 deletions(-) (limited to 'src/codegen.zig') diff --git a/src/codegen.zig b/src/codegen.zig index 783c152595..6739acbfa6 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -1687,7 +1687,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { .register => |src_reg| { // register, register use mr + 1 addressing mode: r/m16/32/64, r16/32/64 try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = dst_reg.size() == 64, + .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, .primary_opcode_1b = mr + 1, // TODO: Explicit optional wrap due to stage 1 miscompilation :( // https://github.com/ziglang/zig/issues/6515 @@ -1705,7 +1705,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { const imm32 = @intCast(u31, imm); // This case must be handled before calling genX8664BinMathCode. if (imm32 <= math.maxInt(u7)) { try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = dst_reg.size() == 64, + .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, .primary_opcode_1b = 0x83, .opcode_extension = opx, // TODO: Explicit optional wrap due to stage 1 miscompilation :( @@ -1719,7 +1719,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { }); } else { try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = dst_reg.size() == 64, + .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, .primary_opcode_1b = 0x81, .opcode_extension = opx, // TODO: Explicit optional wrap due to stage 1 miscompilation :( @@ -1743,7 +1743,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { return self.fail(src, "stack offset too large", .{}); } try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = dst_reg.size() == 64, + .operand_size_64 = abi_size == 64, .primary_opcode_1b = mr + 0x3, .reg = dst_reg, // TODO: Explicit optional wrap due to stage 1 miscompilation :( @@ -1802,7 +1802,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { return self.fail(src, "stack offset too large", .{}); } try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = reg.size() == 64, + .operand_size_64 = abi_size == 64, .primary_opcode_1b = opcode, .reg = reg, // TODO: Explicit optional wrap due to stage 1 miscompilation :( @@ -3707,7 +3707,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // This is a variant of 8B /r. try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = reg.size() == 64, + .operand_size_64 = ty.abiSize(self.target.*) == 64, .primary_opcode_1b = 0x8B, @@ -3740,7 +3740,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // LEA reg, [] // manually do this instruction to make sure the offset into the disp32 field won't change. 
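            // (This is one of the byte positions the first commit's message is
            // about: the address from the offset table is stored into this
            // disp32 later, so it must stay at a fixed, known offset.)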
try self.code.ensureCapacity(self.code.items.len + 7); - self.rex(.{ .w = reg.size() == 64, .r = reg.isExtended() }); + self.rex(.{ .w = ty.abiSize(self.target.*) == 64, .r = reg.isExtended() }); self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8D, 0x05 | (@as(u8, reg.id() & 0b111) << 3), @@ -3749,7 +3749,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // MOV reg, [reg] try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = reg.size() == 64, + .operand_size_64 = ty.abiSize(self.target.*) == 64, .primary_opcode_1b = 0x8B, @@ -3771,7 +3771,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // 0b00RRR100, where RRR is the lower three bits of the register ID. // The instruction is thus eight bytes; REX 0x8B 0b00RRR100 0x25 followed by a four-byte disp32. try self.code.ensureCapacity(self.code.items.len + 8); - self.rex(.{ .w = reg.size() == 64, .b = reg.isExtended() }); + self.rex(.{ .w = ty.abiSize(self.target.*) == 64, .r = reg.isExtended() }); self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8B, 0x04 | (@as(u8, reg.id() & 0b111) << 3), // R @@ -3809,7 +3809,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // Currently, we're only allowing 64-bit registers, so we need the `REX.W 8B /r` variant. // TODO: determine whether to allow other sized registers, and if so, handle them properly. try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = reg.size() == 64, + .operand_size_64 = ty.abiSize(self.target.*) == 64, .primary_opcode_1b = 0x8B, .reg = reg, // TODO: Explicit optional wrap due to stage 1 miscompilation :( @@ -3823,14 +3823,14 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } }, .stack_offset => |unadjusted_off| { - const size_bytes = @divExact(reg.size(), 8); - const off = unadjusted_off + size_bytes; + const abi_size = ty.abiSize(self.target.*); + const off = unadjusted_off + abi_size; if (off < std.math.minInt(i32) or off > std.math.maxInt(i32)) { return self.fail(src, "stack offset too large", .{}); } const ioff = -@intCast(i32, off); try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = reg.size() == 64, + .operand_size_64 = ty.abiSize(self.target.*) == 64, .primary_opcode_1b = 0x8B, .reg = reg, // TODO: Explicit optional wrap due to stage 1 miscompilation :( diff --git a/test/stage2/test.zig b/test/stage2/test.zig index b4bc1a413e..ca1302b9dc 100644 --- a/test/stage2/test.zig +++ b/test/stage2/test.zig @@ -740,7 +740,7 @@ pub fn addCases(ctx: *TestContext) !void { // Spilling registers to the stack. 
case.addCompareOutput(
            \\export fn _start() noreturn {
-            \\    assert(add(3, 4) == 791);
+            \\    assert(add(3, 4) == 1221);
             \\
             \\    exit();
             \\}
@@ -756,19 +756,21 @@ pub fn addCases(ctx: *TestContext) !void {
             \\    const i = g + h; // 100
             \\    const j = i + d; // 110
             \\    const k = i + j; // 210
-            \\    const l = k + c; // 217
-            \\    const m = l + d; // 227
-            \\    const n = m + e; // 241
-            \\    const o = n + f; // 265
-            \\    const p = o + g; // 303
-            \\    const q = p + h; // 365
-            \\    const r = q + i; // 465
-            \\    const s = r + j; // 575
-            \\    const t = s + k; // 785
-            \\    break :blk t;
+            \\    const l = j + k; // 320
+            \\    const m = l + c; // 327
+            \\    const n = m + d; // 337
+            \\    const o = n + e; // 351
+            \\    const p = o + f; // 375
+            \\    const q = p + g; // 413
+            \\    const r = q + h; // 475
+            \\    const s = r + i; // 575
+            \\    const t = s + j; // 685
+            \\    const u = t + k; // 895
+            \\    const v = u + l; // 1215
+            \\    break :blk v;
             \\    };
-            \\    const y = x + a; // 788
-            \\    const z = y + a; // 791
+            \\    const y = x + a; // 1218
+            \\    const z = y + a; // 1221
             \\    return z;
             \\}
             \\
-- 
cgit v1.2.3


From c4b83ea02102611a85f75b189f0803d9b6a335c2 Mon Sep 17 00:00:00 2001
From: gracefu <81774659+gracefuu@users.noreply.github.com>
Date: Fri, 9 Apr 2021 13:51:00 +0800
Subject: stage2 x86_64: implement integer mul

This was also an experiment to see if it was easier to implement a new
feature when using the instruction encoder.

Verdict: It's not that much easier, but I think it's certainly much
more readable, because the description of the Instruction annotates
what each field means.

Right now, precise knowledge of x86_64 instructions is still required
because things like when to set the 64-bit flag, how to read x86_64
instruction references, etc. are still not automatically done for you.

In the future, this interface might make it slightly easier to write an
assembler for x86_64 by abstracting the bit-fiddling aspects of
instruction encoding.
---
 src/Module.zig       |  60 +++++++++++++++++++++
 src/Sema.zig         |   7 +++
 src/codegen.zig      | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++
 test/stage2/test.zig | 102 +++++++++++++++++++++++++++++++++++
 4 files changed, 318 insertions(+)

(limited to 'src/codegen.zig')

diff --git a/src/Module.zig b/src/Module.zig
index 96b490e2a1..90e1a71bd2 100644
--- a/src/Module.zig
+++ b/src/Module.zig
@@ -4330,6 +4330,33 @@ pub fn intSub(allocator: *Allocator, lhs: Value, rhs: Value) !Value {
     }
 }
 
+pub fn intMul(allocator: *Allocator, lhs: Value, rhs: Value) !Value {
+    // TODO is this a performance issue? maybe we should try the operation without
+    // resorting to BigInt first.
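+    // (Sketch of that possible fast path, as a comment only: when both
+    // operands fit in a u64, @mulWithOverflow on machine integers could be
+    // tried first, falling back to the BigInt path below on overflow.)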
+ var lhs_space: Value.BigIntSpace = undefined; + var rhs_space: Value.BigIntSpace = undefined; + const lhs_bigint = lhs.toBigInt(&lhs_space); + const rhs_bigint = rhs.toBigInt(&rhs_space); + const limbs = try allocator.alloc( + std.math.big.Limb, + lhs_bigint.limbs.len + rhs_bigint.limbs.len + 1, + ); + var result_bigint = BigIntMutable{ .limbs = limbs, .positive = undefined, .len = undefined }; + var limbs_buffer = try allocator.alloc( + std.math.big.Limb, + std.math.big.int.calcMulLimbsBufferLen(lhs_bigint.limbs.len, rhs_bigint.limbs.len, 1), + ); + defer allocator.free(limbs_buffer); + result_bigint.mul(lhs_bigint, rhs_bigint, limbs_buffer, allocator); + const result_limbs = result_bigint.limbs[0..result_bigint.len]; + + if (result_bigint.positive) { + return Value.Tag.int_big_positive.create(allocator, result_limbs); + } else { + return Value.Tag.int_big_negative.create(allocator, result_limbs); + } +} + pub fn floatAdd( arena: *Allocator, float_type: Type, @@ -4396,6 +4423,39 @@ pub fn floatSub( } } +pub fn floatMul( + arena: *Allocator, + float_type: Type, + src: LazySrcLoc, + lhs: Value, + rhs: Value, +) !Value { + switch (float_type.tag()) { + .f16 => { + @panic("TODO add __trunctfhf2 to compiler-rt"); + //const lhs_val = lhs.toFloat(f16); + //const rhs_val = rhs.toFloat(f16); + //return Value.Tag.float_16.create(arena, lhs_val * rhs_val); + }, + .f32 => { + const lhs_val = lhs.toFloat(f32); + const rhs_val = rhs.toFloat(f32); + return Value.Tag.float_32.create(arena, lhs_val * rhs_val); + }, + .f64 => { + const lhs_val = lhs.toFloat(f64); + const rhs_val = rhs.toFloat(f64); + return Value.Tag.float_64.create(arena, lhs_val * rhs_val); + }, + .f128, .comptime_float, .c_longdouble => { + const lhs_val = lhs.toFloat(f128); + const rhs_val = rhs.toFloat(f128); + return Value.Tag.float_128.create(arena, lhs_val * rhs_val); + }, + else => unreachable, + } +} + pub fn simplePtrType( mod: *Module, arena: *Allocator, diff --git a/src/Sema.zig b/src/Sema.zig index 98bff5bf23..74af84b078 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -3885,6 +3885,13 @@ fn analyzeArithmetic( try Module.floatSub(sema.arena, scalar_type, src, lhs_val, rhs_val); break :blk val; }, + .mul => blk: { + const val = if (is_int) + try Module.intMul(sema.arena, lhs_val, rhs_val) + else + try Module.floatMul(sema.arena, scalar_type, src, lhs_val, rhs_val); + break :blk val; + }, else => return sema.mod.fail(&block.base, src, "TODO Implement arithmetic operand '{s}'", .{@tagName(zir_tag)}), }; diff --git a/src/codegen.zig b/src/codegen.zig index 6739acbfa6..2f49e10522 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -1079,6 +1079,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { if (inst.base.isUnused()) return MCValue.dead; switch (arch) { + .x86_64 => return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs), .arm, .armeb => return try self.genArmMul(&inst.base, inst.lhs, inst.rhs), else => return self.fail(inst.base.src, "TODO implement mul for {}", .{self.target.cpu.arch}), } @@ -1574,6 +1575,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { .sub, .subwrap => try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, 5, 0x28), .xor, .not => try self.genX8664BinMathCode(inst.src, inst.ty, dst_mcv, src_mcv, 6, 0x30), + .mul, .mulwrap => try self.genX8664Imul(inst.src, inst.ty, dst_mcv, src_mcv), else => unreachable, } @@ -1795,6 +1797,153 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } } + /// Performs integer multiplication between dst_mcv and src_mcv, storing the 
result in dst_mcv. + fn genX8664Imul( + self: *Self, + src: LazySrcLoc, + dst_ty: Type, + dst_mcv: MCValue, + src_mcv: MCValue, + ) !void { + switch (dst_mcv) { + .none => unreachable, + .undef => unreachable, + .dead, .unreach, .immediate => unreachable, + .compare_flags_unsigned => unreachable, + .compare_flags_signed => unreachable, + .ptr_stack_offset => unreachable, + .ptr_embedded_in_code => unreachable, + .register => |dst_reg| { + switch (src_mcv) { + .none => unreachable, + .undef => try self.genSetReg(src, dst_ty, dst_reg, .undef), + .dead, .unreach => unreachable, + .ptr_stack_offset => unreachable, + .ptr_embedded_in_code => unreachable, + .register => |src_reg| { + // register, register + // + // Use the following imul opcode + // 0F AF /r: IMUL r32/64, r/m32/64 + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, + .primary_opcode_2b = 0xaf, + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .reg = src_reg }, + ), + .reg = dst_reg, + }); + }, + .immediate => |imm| { + // register, immediate: + // depends on size of immediate. + // + // immediate fits in i8: + // 6B /r ib: IMUL r32/64, r/m32/64, imm8 + // + // immediate fits in i32: + // 69 /r id: IMUL r32/64, r/m32/64, imm32 + // + // immediate is huge: + // split into 2 instructions + // 1) copy the 64 bit immediate into a tmp register + // 2) perform register,register mul + // 0F AF /r: IMUL r32/64, r/m32/64 + if (math.minInt(i8) <= imm and imm <= math.maxInt(i8)) { + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, + .primary_opcode_1b = 0x6B, + .reg = dst_reg, + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .reg = dst_reg }, + ), + .immediate_bytes = 1, + .immediate = imm, + }); + } else if (math.minInt(i32) <= imm and imm <= math.maxInt(i32)) { + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, + .primary_opcode_1b = 0x69, + .reg = dst_reg, + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .reg = dst_reg }, + ), + .immediate_bytes = 4, + .immediate = imm, + }); + } else { + const src_reg = try self.copyToTmpRegister(src, dst_ty, src_mcv); + return self.genX8664Imul(src, dst_ty, dst_mcv, MCValue{ .register = src_reg }); + } + }, + .embedded_in_code, .memory, .stack_offset => { + return self.fail(src, "TODO implement x86 multiply source memory", .{}); + }, + .compare_flags_unsigned => { + return self.fail(src, "TODO implement x86 multiply source compare flag (unsigned)", .{}); + }, + .compare_flags_signed => { + return self.fail(src, "TODO implement x86 multiply source compare flag (signed)", .{}); + }, + } + }, + .stack_offset => |off| { + switch (src_mcv) { + .none => unreachable, + .undef => return self.genSetStack(src, dst_ty, off, .undef), + .dead, .unreach => unreachable, + .ptr_stack_offset => unreachable, + .ptr_embedded_in_code => unreachable, + .register => |src_reg| { + // copy dst to a register + const dst_reg = try self.copyToTmpRegister(src, dst_ty, dst_mcv); + // multiply into 
dst_reg + // register, register + // Use the following imul opcode + // 0F AF /r: IMUL r32/64, r/m32/64 + try self.encodeX8664Instruction(src, Instruction{ + .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, + .primary_opcode_2b = 0xaf, + // TODO: Explicit optional wrap due to stage 1 miscompilation :( + // https://github.com/ziglang/zig/issues/6515 + .modrm = @as( + ?Instruction.ModrmEffectiveAddress, + Instruction.ModrmEffectiveAddress{ .reg = src_reg }, + ), + .reg = dst_reg, + }); + // copy dst_reg back out + return self.genSetStack(src, dst_ty, off, MCValue{ .register = dst_reg }); + }, + .immediate => |imm| { + return self.fail(src, "TODO implement x86 multiply source immediate", .{}); + }, + .embedded_in_code, .memory, .stack_offset => { + return self.fail(src, "TODO implement x86 multiply source memory", .{}); + }, + .compare_flags_unsigned => { + return self.fail(src, "TODO implement x86 multiply source compare flag (unsigned)", .{}); + }, + .compare_flags_signed => { + return self.fail(src, "TODO implement x86 multiply source compare flag (signed)", .{}); + }, + } + }, + .embedded_in_code, .memory => { + return self.fail(src, "TODO implement x86 multiply destination memory", .{}); + }, + } + } + fn genX8664ModRMRegToStack(self: *Self, src: LazySrcLoc, ty: Type, off: u32, reg: Register, opcode: u8) !void { const abi_size = ty.abiSize(self.target.*); const adj_off = off + abi_size; diff --git a/test/stage2/test.zig b/test/stage2/test.zig index ca1302b9dc..ffcf562322 100644 --- a/test/stage2/test.zig +++ b/test/stage2/test.zig @@ -358,6 +358,81 @@ pub fn addCases(ctx: *TestContext) !void { , &[_][]const u8{":2:15: error: incompatible types: 'bool' and 'comptime_int'"}); } + { + var case = ctx.exe("multiplying numbers at runtime and comptime", linux_x64); + case.addCompareOutput( + \\export fn _start() noreturn { + \\ mul(3, 4); + \\ + \\ exit(); + \\} + \\ + \\fn mul(a: u32, b: u32) void { + \\ if (a * b != 12) unreachable; + \\} + \\ + \\fn exit() noreturn { + \\ asm volatile ("syscall" + \\ : + \\ : [number] "{rax}" (231), + \\ [arg1] "{rdi}" (0) + \\ : "rcx", "r11", "memory" + \\ ); + \\ unreachable; + \\} + , + "", + ); + // comptime function call + case.addCompareOutput( + \\export fn _start() noreturn { + \\ exit(); + \\} + \\ + \\fn mul(a: u32, b: u32) u32 { + \\ return a * b; + \\} + \\ + \\const x = mul(3, 4); + \\ + \\fn exit() noreturn { + \\ asm volatile ("syscall" + \\ : + \\ : [number] "{rax}" (231), + \\ [arg1] "{rdi}" (x - 12) + \\ : "rcx", "r11", "memory" + \\ ); + \\ unreachable; + \\} + , + "", + ); + // Inline function call + case.addCompareOutput( + \\export fn _start() noreturn { + \\ var x: usize = 5; + \\ const y = mul(2, 3, x); + \\ exit(y - 30); + \\} + \\ + \\fn mul(a: usize, b: usize, c: usize) callconv(.Inline) usize { + \\ return a * b * c; + \\} + \\ + \\fn exit(code: usize) noreturn { + \\ asm volatile ("syscall" + \\ : + \\ : [number] "{rax}" (231), + \\ [arg1] "{rdi}" (code) + \\ : "rcx", "r11", "memory" + \\ ); + \\ unreachable; + \\} + , + "", + ); + } + { var case = ctx.exe("assert function", linux_x64); case.addCompareOutput( @@ -741,6 +816,7 @@ pub fn addCases(ctx: *TestContext) !void { case.addCompareOutput( \\export fn _start() noreturn { \\ assert(add(3, 4) == 1221); + \\ assert(mul(3, 4) == 21609); \\ \\ exit(); \\} @@ -774,6 +850,32 @@ pub fn addCases(ctx: *TestContext) !void { \\ return z; \\} \\ + \\fn mul(a: u32, b: u32) u32 { + \\ const x: u32 = blk: { + \\ const c = a * a * a * a; // 81 + \\ const d = a * a * a * b; // 108 + \\ 
const e = a * a * b * a; // 108 + \\ const f = a * a * b * b; // 144 + \\ const g = a * b * a * a; // 108 + \\ const h = a * b * a * b; // 144 + \\ const i = a * b * b * a; // 144 + \\ const j = a * b * b * b; // 192 + \\ const k = b * a * a * a; // 108 + \\ const l = b * a * a * b; // 144 + \\ const m = b * a * b * a; // 144 + \\ const n = b * a * b * b; // 192 + \\ const o = b * b * a * a; // 144 + \\ const p = b * b * a * b; // 192 + \\ const q = b * b * b * a; // 192 + \\ const r = b * b * b * b; // 256 + \\ const s = c + d + e + f + g + h + i + j + k + l + m + n + o + p + q + r; // 2401 + \\ break :blk s; + \\ }; + \\ const y = x * a; // 7203 + \\ const z = y * a; // 21609 + \\ return z; + \\} + \\ \\pub fn assert(ok: bool) void { \\ if (!ok) unreachable; // assertion failure \\} -- cgit v1.2.3 From 0409f9e0244aebab5c47f0ec24114e101c3f54e6 Mon Sep 17 00:00:00 2001 From: gracefu <81774659+gracefuu@users.noreply.github.com> Date: Sun, 11 Apr 2021 16:09:47 +0800 Subject: stage2 x86_64: simplify inst encoder to a set of dumb helper fns --- src/codegen.zig | 596 ++++++++++++++++++++-------------------- src/codegen/x86_64.zig | 728 +++++++++++++++++++++++++++---------------------- 2 files changed, 694 insertions(+), 630 deletions(-) (limited to 'src/codegen.zig') diff --git a/src/codegen.zig b/src/codegen.zig index 2f49e10522..27a60597d4 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -20,6 +20,8 @@ const build_options = @import("build_options"); const LazySrcLoc = Module.LazySrcLoc; const RegisterManager = @import("register_manager.zig").RegisterManager; +const X8664Encoder = @import("codegen/x86_64.zig").Encoder; + /// The codegen-related data that is stored in `ir.Inst.Block` instructions. pub const BlockData = struct { relocs: std.ArrayListUnmanaged(Reloc) = undefined, @@ -1617,9 +1619,9 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { /// /// opcode | operand shape /// --------+---------------------- - /// 80 /opx | r/m8, imm8 - /// 81 /opx | r/m16/32/64, imm16/32 - /// 83 /opx | r/m16/32/64, imm8 + /// 80 /opx | *r/m8*, imm8 + /// 81 /opx | *r/m16/32/64*, imm16/32 + /// 83 /opx | *r/m16/32/64*, imm8 /// /// "mr"-style instructions use the low bits of opcode to indicate shape of instruction: /// @@ -1634,12 +1636,12 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { /// /// opcode | operand shape /// -------+------------------------- - /// mr + 0 | r/m8, r8 - /// mr + 1 | r/m16/32/64, r16/32/64 - /// mr + 2 | r8, r/m8 - /// mr + 3 | r16/32/64, r/m16/32/64 - /// mr + 4 | AL, imm8 - /// mr + 5 | rAX, imm16/32 + /// mr + 0 | *r/m8*, r8 + /// mr + 1 | *r/m16/32/64*, r16/32/64 + /// mr + 2 | *r8*, r/m8 + /// mr + 3 | *r16/32/64*, r/m16/32/64 + /// mr + 4 | *AL*, imm8 + /// mr + 5 | *rAX*, imm16/32 /// /// TODO: rotates and shifts share the same structure, so we can potentially implement them /// at a later date with very similar code. 
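A minimal sketch of how the shift TODO above could reuse these same helpers,
assuming hypothetical locals `dst_reg: Register`, `shamt: i8`, and an `is_64`
flag computed as elsewhere in this patch (none of these exist in the code yet):

    // SHL r/m32/64, imm8 is c1 /4 ib
    const encoder = try X8664Encoder.init(self.code, 4);
    encoder.rex(.{ .w = is_64, .b = dst_reg.isExtended() });
    encoder.opcode_1byte(0xc1);
    encoder.modRm_direct(4, dst_reg.low_id());
    encoder.imm8(shamt);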
@@ -1656,12 +1658,12 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { /// /// opcode | operand shape /// --------+------------------ - /// c0 /opx | r/m8, imm8 - /// c1 /opx | r/m16/32/64, imm8 - /// d0 /opx | r/m8, 1 - /// d1 /opx | r/m16/32/64, 1 - /// d2 /opx | r/m8, CL (for context, CL is register 1) - /// d3 /opx | r/m16/32/64, CL (for context, CL is register 1) + /// c0 /opx | *r/m8*, imm8 + /// c1 /opx | *r/m16/32/64*, imm8 + /// d0 /opx | *r/m8*, 1 + /// d1 /opx | *r/m16/32/64*, 1 + /// d2 /opx | *r/m8*, CL (for context, CL is register 1) + /// d3 /opx | *r/m16/32/64*, CL (for context, CL is register 1) fn genX8664BinMathCode( self: *Self, src: LazySrcLoc, @@ -1687,77 +1689,84 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { .ptr_stack_offset => unreachable, .ptr_embedded_in_code => unreachable, .register => |src_reg| { - // register, register use mr + 1 addressing mode: r/m16/32/64, r16/32/64 - try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, - .primary_opcode_1b = mr + 1, - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .reg = dst_reg }, - ), - .reg = src_reg, + // for register, register use mr + 1 + // addressing mode: *r/m16/32/64*, r16/32/64 + const operand_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 3); + encoder.rex(.{ + .w = operand_size == 64, + .r = src_reg.isExtended(), + .b = dst_reg.isExtended(), }); + encoder.opcode_1byte(mr + 1); + encoder.modRm_direct( + src_reg.low_id(), + dst_reg.low_id(), + ); }, .immediate => |imm| { // register, immediate use opx = 81 or 83 addressing modes: // opx = 81: r/m16/32/64, imm16/32 // opx = 83: r/m16/32/64, imm8 - const imm32 = @intCast(u31, imm); // This case must be handled before calling genX8664BinMathCode. - if (imm32 <= math.maxInt(u7)) { - try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, - .primary_opcode_1b = 0x83, - .opcode_extension = opx, - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .reg = dst_reg }, - ), - .immediate_bytes = 1, - .immediate = imm32, + const imm32 = @intCast(i32, imm); // This case must be handled before calling genX8664BinMathCode. 
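+                        // Worked example (illustration only): `add rax, 0x10` encodes
+                        // as 48 83 c0 10 -- REX.W, opcode 83, ModRM 0xc0 (mod=11,
+                        // opx=/0 for add, r/m=rax), then the imm8 0x10.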
+ if (imm32 <= math.maxInt(i8)) { + const operand_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 4); + encoder.rex(.{ + .w = operand_size == 64, + .b = dst_reg.isExtended(), }); + encoder.opcode_1byte(0x83); + encoder.modRm_direct( + opx, + dst_reg.low_id(), + ); + encoder.imm8(@intCast(i8, imm32)); } else { - try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, - .primary_opcode_1b = 0x81, - .opcode_extension = opx, - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .reg = dst_reg }, - ), - .immediate_bytes = 4, - .immediate = imm32, + const operand_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 7); + encoder.rex(.{ + .w = operand_size == 64, + .b = dst_reg.isExtended(), }); + encoder.opcode_1byte(0x81); + encoder.modRm_direct( + opx, + dst_reg.low_id(), + ); + encoder.imm32(@intCast(i32, imm32)); } }, .embedded_in_code, .memory => { return self.fail(src, "TODO implement x86 ADD/SUB/CMP source memory", .{}); }, .stack_offset => |off| { + // register, indirect use mr + 3 + // addressing mode: *r16/32/64*, r/m16/32/64 const abi_size = dst_ty.abiSize(self.target.*); const adj_off = off + abi_size; if (off > math.maxInt(i32)) { return self.fail(src, "stack offset too large", .{}); } - try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = abi_size == 64, - .primary_opcode_1b = mr + 0x3, - .reg = dst_reg, - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .mem_disp = .{ - .reg = Register.ebp, - .disp = -@intCast(i32, adj_off), - } }, - ), + const encoder = try X8664Encoder.init(self.code, 7); + encoder.rex(.{ + .w = abi_size == 64, + .r = dst_reg.isExtended(), }); + encoder.opcode_1byte(mr + 3); + if (adj_off <= std.math.maxInt(i8)) { + encoder.modRm_indirectDisp8( + dst_reg.low_id(), + Register.ebp.low_id(), + ); + encoder.disp8(-@intCast(i8, adj_off)); + } else { + encoder.modRm_indirectDisp32( + dst_reg.low_id(), + Register.ebp.low_id(), + ); + encoder.disp32(-@intCast(i32, adj_off)); + } }, .compare_flags_unsigned => { return self.fail(src, "TODO implement x86 ADD/SUB/CMP source compare flag (unsigned)", .{}); @@ -1825,17 +1834,18 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // // Use the following imul opcode // 0F AF /r: IMUL r32/64, r/m32/64 - try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, - .primary_opcode_2b = 0xaf, - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .reg = src_reg }, - ), - .reg = dst_reg, + const abi_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 4); + encoder.rex(.{ + .w = abi_size == 64, + .r = dst_reg.isExtended(), + .b = src_reg.isExtended(), }); + encoder.opcode_2byte(0x0f, 0xaf); + encoder.modRm_direct( + dst_reg.low_id(), + src_reg.low_id(), + ); }, .immediate => |imm| { // register, immediate: @@ -1853,33 +1863,33 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // 2) perform register,register mul // 0F AF /r: IMUL r32/64, 
r/m32/64 if (math.minInt(i8) <= imm and imm <= math.maxInt(i8)) { - try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, - .primary_opcode_1b = 0x6B, - .reg = dst_reg, - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .reg = dst_reg }, - ), - .immediate_bytes = 1, - .immediate = imm, + const abi_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 4); + encoder.rex(.{ + .w = abi_size == 64, + .r = dst_reg.isExtended(), + .b = dst_reg.isExtended(), }); + encoder.opcode_1byte(0x6B); + encoder.modRm_direct( + dst_reg.low_id(), + dst_reg.low_id(), + ); + encoder.imm8(@intCast(i8, imm)); } else if (math.minInt(i32) <= imm and imm <= math.maxInt(i32)) { - try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, - .primary_opcode_1b = 0x69, - .reg = dst_reg, - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .reg = dst_reg }, - ), - .immediate_bytes = 4, - .immediate = imm, + const abi_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 7); + encoder.rex(.{ + .w = abi_size == 64, + .r = dst_reg.isExtended(), + .b = dst_reg.isExtended(), }); + encoder.opcode_1byte(0x69); + encoder.modRm_direct( + dst_reg.low_id(), + dst_reg.low_id(), + ); + encoder.imm32(@intCast(i32, imm)); } else { const src_reg = try self.copyToTmpRegister(src, dst_ty, src_mcv); return self.genX8664Imul(src, dst_ty, dst_mcv, MCValue{ .register = src_reg }); @@ -1910,17 +1920,18 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // register, register // Use the following imul opcode // 0F AF /r: IMUL r32/64, r/m32/64 - try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = dst_ty.abiSize(self.target.*) == 64, - .primary_opcode_2b = 0xaf, - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .reg = src_reg }, - ), - .reg = dst_reg, + const abi_size = dst_ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 4); + encoder.rex(.{ + .w = abi_size == 64, + .r = dst_reg.isExtended(), + .b = src_reg.isExtended(), }); + encoder.opcode_2byte(0x0f, 0xaf); + encoder.modRm_direct( + dst_reg.low_id(), + src_reg.low_id(), + ); // copy dst_reg back out return self.genSetStack(src, dst_ty, off, MCValue{ .register = dst_reg }); }, @@ -1950,20 +1961,29 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { if (off > math.maxInt(i32)) { return self.fail(src, "stack offset too large", .{}); } - try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = abi_size == 64, - .primary_opcode_1b = opcode, - .reg = reg, - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .mem_disp = .{ - .reg = Register.ebp, - .disp = -@intCast(i32, adj_off), - } }, - ), + + const i_adj_off = -@intCast(i32, adj_off); + const encoder = try X8664Encoder.init(self.code, 7); + encoder.rex(.{ + .w = abi_size == 64, + .r = 
reg.isExtended(), }); + encoder.opcode_1byte(opcode); + if (i_adj_off < std.math.maxInt(i8)) { + // example: 48 89 55 7f mov QWORD PTR [rbp+0x7f],rdx + encoder.modRm_indirectDisp8( + reg.low_id(), + Register.ebp.low_id(), + ); + encoder.disp8(@intCast(i8, i_adj_off)); + } else { + // example: 48 89 95 80 00 00 00 mov QWORD PTR [rbp+0x80],rdx + encoder.modRm_indirectDisp32( + reg.low_id(), + Register.ebp.low_id(), + ); + encoder.disp32(i_adj_off); + } } fn genArgDbgInfo(self: *Self, inst: *ir.Inst.Arg, mcv: MCValue) !void { @@ -2630,25 +2650,20 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { }, .register => |reg| blk: { // test reg, 1 - try self.encodeX8664Instruction(inst.base.src, Instruction{ + // TODO detect al, ax, eax + const encoder = try X8664Encoder.init(self.code, 4); + encoder.rex(.{ // TODO audit this codegen: we force w = true here to make // the value affect the big register - .operand_size_64 = true, - - .primary_opcode_1b = 0xf6, // f6/0 is TEST r/m8, imm8 - .opcode_extension = 0, - - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - // TODO detect al, ax, eax, there's another opcode 0xa8 for that - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .reg = reg }, - ), - - .immediate_bytes = 1, - .immediate = 1, + .w = true, + .b = reg.isExtended(), }); + encoder.opcode_1byte(0xf6); + encoder.modRm_direct( + 0, + reg.low_id(), + ); + encoder.disp8(1); break :blk 0x84; }, else => return self.fail(inst.base.src, "TODO implement condbr {s} when condition is {s}", .{ self.target.cpu.arch, @tagName(cond) }), @@ -3170,39 +3185,6 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } } - /// Encodes a REX prefix as specified, and appends it to the instruction - /// stream. This only modifies the instruction stream if at least one bit - /// is set true, which has a few implications: - /// - /// * The length of the instruction buffer will be modified *if* the - /// resulting REX is meaningful, but will remain the same if it is not. - /// * Deliberately inserting a "meaningless REX" requires explicit usage of - /// 0x40, and cannot be done via this function. - /// W => 64 bit mode - /// R => extension to the MODRM.reg field - /// X => extension to the SIB.index field - /// B => extension to the MODRM.rm field or the SIB.base field - fn rex(self: *Self, arg: struct { b: bool = false, w: bool = false, x: bool = false, r: bool = false }) void { - comptime assert(arch == .x86_64); - // From section 2.2.1.2 of the manual, REX is encoded as b0100WRXB. - var value: u8 = 0x40; - if (arg.b) { - value |= 0x1; - } - if (arg.x) { - value |= 0x2; - } - if (arg.r) { - value |= 0x4; - } - if (arg.w) { - value |= 0x8; - } - if (value != 0x40) { - self.code.appendAssumeCapacity(value); - } - } - /// Sets the value without any modifications to register allocation metadata or stack allocation metadata. 
fn setRegOrMem(self: *Self, src: LazySrcLoc, ty: Type, loc: MCValue, val: MCValue) !void { switch (loc) { @@ -3750,27 +3732,25 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } }, .compare_flags_unsigned => |op| { - try self.encodeX8664Instruction(src, Instruction{ - // TODO audit this codegen: we force w = true here to make - // the value affect the big register - .operand_size_64 = true, - - .primary_opcode_2b = switch (op) { - .gte => 0x93, - .gt => 0x97, - .neq => 0x95, - .lt => 0x92, - .lte => 0x96, - .eq => 0x94, - }, - - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .reg = reg }, - ), + const encoder = try X8664Encoder.init(self.code, 7); + // TODO audit this codegen: we force w = true here to make + // the value affect the big register + encoder.rex(.{ + .w = true, + .b = reg.isExtended(), }); + encoder.opcode_2byte(0x0f, switch (op) { + .gte => 0x93, + .gt => 0x97, + .neq => 0x95, + .lt => 0x92, + .lte => 0x96, + .eq => 0x94, + }); + encoder.modRm_direct( + 0, + reg.low_id(), + ); }, .compare_flags_signed => |op| { return self.fail(src, "TODO set register with compare flags value (signed)", .{}); @@ -3780,34 +3760,43 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // register is the fastest way to zero a register. if (x == 0) { // The encoding for `xor r32, r32` is `0x31 /r`. + const encoder = try X8664Encoder.init(self.code, 3); + + // If we're accessing e.g. r8d, we need to use a REX prefix before the actual operation. Since + // this is a 32-bit operation, the W flag is set to zero. X is also zero, as we're not using a SIB. + // Both R and B are set, as we're extending, in effect, the register bits *and* the operand. + encoder.rex(.{ + .r = reg.isExtended(), + .b = reg.isExtended(), + }); + encoder.opcode_1byte(0x31); // Section 3.1.1.1 of the Intel x64 Manual states that "/r indicates that the // ModR/M byte of the instruction contains a register operand and an r/m operand." - try self.encodeX8664Instruction(src, Instruction{ - .primary_opcode_1b = 0x31, - - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .reg = @as(?Register, reg), - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .reg = reg }, - ), - }); + encoder.modRm_direct( + reg.low_id(), + reg.low_id(), + ); + return; } - if (x <= math.maxInt(u32)) { + if (x <= math.maxInt(i32)) { // Next best case: if we set the lower four bytes, the upper four will be zeroed. // // The encoding for `mov IMM32 -> REG` is (0xB8 + R) IMM. - try self.encodeX8664Instruction(src, Instruction{ - // B8 + R - .primary_opcode_1b = 0xB8, - .opcode_reg = @as(?Register, reg), - - // IMM32 - .immediate_bytes = 4, - .immediate = x, + + const encoder = try X8664Encoder.init(self.code, 6); + // Just as with XORing, we need a REX prefix. This time though, we only + // need the B bit set, as we're extending the opcode's register field, + // and there is no Mod R/M byte. + encoder.rex(.{ + .b = reg.isExtended(), }); + encoder.opcode_withReg(0xB8, reg.low_id()); + + // no ModR/M byte + + // IMM + encoder.imm32(@intCast(i32, x)); return; } // Worst case: we need to load the 64-bit register with the IMM. 
GNU's assembler calls
@@ -3817,37 +3806,40 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
             // This encoding is, in fact, the *same* as the one used for 32-bit loads. The only
             // difference is that we set REX.W before the instruction, which extends the load to
             // 64-bit and uses the full bit-width of the register.
-            try self.encodeX8664Instruction(src, Instruction{
-                .operand_size_64 = true,
-                // B8 + R
-                .primary_opcode_1b = 0xB8,
-                .opcode_reg = @as(?Register, reg),
-
-                // IMM64
-                .immediate_bytes = 8,
-                .immediate = x,
-            });
+            {
+                const encoder = try X8664Encoder.init(self.code, 10);
+                encoder.rex(.{
+                    .w = true,
+                    .b = reg.isExtended(),
+                });
+                encoder.opcode_withReg(0xB8, reg.low_id());
+                encoder.imm64(x);
+            }
         },
         .embedded_in_code => |code_offset| {
+            // We need the offset from RIP in a signed i32 two's complement.
+            // The instruction is 7 bytes long and RIP points to the next instruction.
+
             // 64-bit LEA is encoded as REX.W 8D /r.
-            const rip = self.code.items.len;
+            const rip = self.code.items.len + 7;
             const big_offset = @intCast(i64, code_offset) - @intCast(i64, rip);
             const offset = @intCast(i32, big_offset);
-            try self.encodeX8664Instruction(src, Instruction{
-                .operand_size_64 = true,
-
-                // LEA
-                .primary_opcode_1b = 0x8D,
+            const encoder = try X8664Encoder.init(self.code, 7);

-                .reg = reg,
-
-                // TODO: Explicit optional wrap due to stage 1 miscompilation :(
-                // https://github.com/ziglang/zig/issues/6515
-                .modrm = @as(
-                    ?Instruction.ModrmEffectiveAddress,
-                    Instruction.ModrmEffectiveAddress{ .disp32 = @bitCast(i32, offset) },
-                ),
+            // byte 1, always exists because w = true
+            encoder.rex(.{
+                .w = true,
+                .r = reg.isExtended(),
             });
+            // byte 2
+            encoder.opcode_1byte(0x8D);
+            // byte 3
+            encoder.modRm_RIPDisp32(reg.low_id());
+            // bytes 4-7
+            encoder.disp32(offset);
+
+            // Double-check that we haven't made any math errors
+            assert(rip == self.code.items.len);
         },
         .register => |src_reg| {
             // If the registers are the same, nothing to do.
@@ -3855,20 +3847,15 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                 return;

             // This is a variant of 8B /r.
-            try self.encodeX8664Instruction(src, Instruction{
-                .operand_size_64 = ty.abiSize(self.target.*) == 64,
-
-                .primary_opcode_1b = 0x8B,
-
-                .reg = reg,
-
-                // TODO: Explicit optional wrap due to stage 1 miscompilation :(
-                // https://github.com/ziglang/zig/issues/6515
-                .modrm = @as(
-                    ?Instruction.ModrmEffectiveAddress,
-                    Instruction.ModrmEffectiveAddress{ .reg = src_reg },
-                ),
+            const abi_size = ty.abiSize(self.target.*);
+            const encoder = try X8664Encoder.init(self.code, 3);
+            encoder.rex(.{
+                .w = abi_size == 64,
+                .r = reg.isExtended(),
+                .b = src_reg.isExtended(),
             });
+            encoder.opcode_1byte(0x8B);
+            encoder.modRm_direct(reg.low_id(), src_reg.low_id());
         },
         .memory => |x| {
             if (self.bin_file.options.pie) {
@@ -3886,32 +3873,28 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                     return self.fail(src, "TODO implement genSetReg for PIE GOT indirection on this platform", .{});
                 }

+                const abi_size = ty.abiSize(self.target.*);
+                const encoder = try X8664Encoder.init(self.code, 7);
                 // LEA reg, []
-                // manually do this instruction to make sure the offset into the disp32 field won't change.
- try self.code.ensureCapacity(self.code.items.len + 7); - self.rex(.{ .w = ty.abiSize(self.target.*) == 64, .r = reg.isExtended() }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0x8D, - 0x05 | (@as(u8, reg.id() & 0b111) << 3), + // TODO: Check if this breaks on macho if abi_size != 64 and reg is not extended + // this causes rex byte to be omitted, which might mean the offset (+3) above is wrong. + encoder.rex(.{ + .w = abi_size == 64, + .r = reg.isExtended(), }); - mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), 0); + encoder.opcode_1byte(0x8D); + encoder.modRm_RIPDisp32(reg.low_id()); + encoder.disp32(0); // MOV reg, [reg] - try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = ty.abiSize(self.target.*) == 64, - - .primary_opcode_1b = 0x8B, - - .reg = reg, - - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .mem = reg }, - ), + encoder.rex(.{ + .w = abi_size == 64, + .r = reg.isExtended(), + .b = reg.isExtended(), }); - } else if (x <= math.maxInt(u32)) { + encoder.opcode_1byte(0x8B); + encoder.modRm_indirectDisp0(reg.low_id(), reg.low_id()); + } else if (x <= math.maxInt(i32)) { // Moving from memory to a register is a variant of `8B /r`. // Since we're using 64-bit moves, we require a REX. // This variant also requires a SIB, as it would otherwise be RIP-relative. @@ -3919,14 +3902,18 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // The SIB must be 0x25, to indicate a disp32 with no scaled index. // 0b00RRR100, where RRR is the lower three bits of the register ID. // The instruction is thus eight bytes; REX 0x8B 0b00RRR100 0x25 followed by a four-byte disp32. - try self.code.ensureCapacity(self.code.items.len + 8); - self.rex(.{ .w = ty.abiSize(self.target.*) == 64, .r = reg.isExtended() }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0x8B, - 0x04 | (@as(u8, reg.id() & 0b111) << 3), // R - 0x25, + const abi_size = ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 8); + encoder.rex(.{ + .w = abi_size == 64, + .r = reg.isExtended(), }); - mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), @intCast(u32, x)); + encoder.opcode_1byte(0x8B); + // effective address = [SIB] + encoder.modRm_SIBDisp0(reg.low_id()); + // SIB = disp32 + encoder.sib_disp32(); + encoder.disp32(@intCast(i32, x)); } else { // If this is RAX, we can use a direct load; otherwise, we need to load the address, then indirectly load // the value. @@ -3935,12 +3922,12 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // moffs64* is a 64-bit offset "relative to segment base", which really just means the // absolute address for all practical purposes. - try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = true, - .primary_opcode_1b = 0xa1, - .immediate_bytes = 8, - .immediate = x, + const encoder = try X8664Encoder.init(self.code, 10); + encoder.rex(.{ + .w = true, }); + encoder.opcode_1byte(0xA1); + encoder.writeIntLittle(u64, x); } else { // This requires two instructions; a move imm as used above, followed by an indirect load using the register // as the address and the register as the destination. @@ -3957,17 +3944,17 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // Now, the register contains the address of the value to load into it // Currently, we're only allowing 64-bit registers, so we need the `REX.W 8B /r` variant. 
// TODO: determine whether to allow other sized registers, and if so, handle them properly. - try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = ty.abiSize(self.target.*) == 64, - .primary_opcode_1b = 0x8B, - .reg = reg, - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .mem = reg }, - ), + + // mov reg, [reg] + const abi_size = ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 3); + encoder.rex(.{ + .w = abi_size == 64, + .r = reg.isExtended(), + .b = reg.isExtended(), }); + encoder.opcode_1byte(0x8B); + encoder.modRm_indirectDisp0(reg.low_id(), reg.low_id()); } } }, @@ -3978,20 +3965,21 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { return self.fail(src, "stack offset too large", .{}); } const ioff = -@intCast(i32, off); - try self.encodeX8664Instruction(src, Instruction{ - .operand_size_64 = ty.abiSize(self.target.*) == 64, - .primary_opcode_1b = 0x8B, - .reg = reg, - // TODO: Explicit optional wrap due to stage 1 miscompilation :( - // https://github.com/ziglang/zig/issues/6515 - .modrm = @as( - ?Instruction.ModrmEffectiveAddress, - Instruction.ModrmEffectiveAddress{ .mem_disp = .{ - .reg = Register.ebp, - .disp = ioff, - } }, - ), + const encoder = try X8664Encoder.init(self.code, 3); + encoder.rex(.{ + .w = abi_size == 64, + .r = reg.isExtended(), }); + encoder.opcode_1byte(0x8B); + if (std.math.minInt(i8) <= ioff and ioff <= std.math.maxInt(i8)) { + // Example: 48 8b 4d 7f mov rcx,QWORD PTR [rbp+0x7f] + encoder.modRm_indirectDisp8(reg.low_id(), Register.ebp.low_id()); + encoder.disp8(@intCast(i8, ioff)); + } else { + // Example: 48 8b 8d 80 00 00 00 mov rcx,QWORD PTR [rbp+0x80] + encoder.modRm_indirectDisp32(reg.low_id(), Register.ebp.low_id()); + encoder.disp32(ioff); + } }, }, else => return self.fail(src, "TODO implement getSetReg for {}", .{self.target.cpu.arch}), diff --git a/src/codegen/x86_64.zig b/src/codegen/x86_64.zig index 745d1b13cf..dd0b74d46a 100644 --- a/src/codegen/x86_64.zig +++ b/src/codegen/x86_64.zig @@ -3,6 +3,7 @@ const testing = std.testing; const mem = std.mem; const assert = std.debug.assert; const ArrayList = std.ArrayList; +const Allocator = std.mem.Allocator; const Type = @import("../Type.zig"); const DW = std.dwarf; @@ -145,51 +146,57 @@ pub const callee_preserved_regs = [_]Register{ .rax, .rcx, .rdx, .rsi, .rdi, .r8 pub const c_abi_int_param_regs = [_]Register{ .rdi, .rsi, .rdx, .rcx, .r8, .r9 }; pub const c_abi_int_return_regs = [_]Register{ .rax, .rdx }; -/// Represents an unencoded x86 instruction. +/// Encoding helper functions for x86_64 instructions /// -/// Roughly based on the table headings at http://ref.x86asm.net/coder64.html -pub const Instruction = struct { - /// Opcode prefix, needed for certain rare ops (e.g. MOVSS) - opcode_prefix: ?u8 = null, - - /// One-byte primary opcode - primary_opcode_1b: ?u8 = null, - /// Two-byte primary opcode (always prefixed with 0f) - primary_opcode_2b: ?u8 = null, - // TODO: Support 3-byte opcodes - - /// Secondary opcode - secondary_opcode: ?u8 = null, - - /// Opcode extension (to be placed in the ModR/M byte in place of reg) - opcode_extension: ?u3 = null, - - /// Legacy prefixes to use with this instruction - /// Most of the time, this field will be 0 and no prefixes are added. - /// Otherwise, a prefix will be added for each field set. 
- legacy_prefixes: LegacyPrefixes = .{}, - - /// 64-bit operand size - operand_size_64: bool = false, - - /// The opcode-reg field, - /// stored in the 3 least significant bits of the opcode - /// on certain instructions + REX if extended - opcode_reg: ?Register = null, - - /// The reg field - reg: ?Register = null, - /// The mod + r/m field - modrm: ?ModrmEffectiveAddress = null, - /// Location of the 3rd operand, if applicable - sib: ?SibEffectiveAddress = null, - - /// Number of bytes of immediate - immediate_bytes: u8 = 0, - /// The value of the immediate - immediate: u64 = 0, - - /// See legacy_prefixes +/// Many of these helpers do very little, but they can help make things +/// slightly more readable with more descriptive field names / function names. +/// +/// Some of them also have asserts to ensure that we aren't doing dumb things. +/// For example, trying to use register 4 (esp) in an indirect modr/m byte is illegal, +/// you need to encode it with an SIB byte. +/// +/// Note that ALL of these helper functions will assume capacity, +/// so ensure that the `code` has sufficient capacity before using them. +/// The `init` method is the recommended way to ensure capacity. +pub const Encoder = struct { + /// Non-owning reference to the code array + code: *ArrayList(u8), + + const Self = @This(); + + /// Wrap `code` in Encoder to make it easier to call these helper functions + /// + /// maximum_inst_size should contain the maximum number of bytes + /// that the encoded instruction will take. + /// This is because the helper functions will assume capacity + /// in order to avoid bounds checking. + pub fn init(code: *ArrayList(u8), maximum_inst_size: u8) !Self { + try code.ensureCapacity(code.items.len + maximum_inst_size); + return Self{ .code = code }; + } + + /// Directly write a number to the code array with big endianness + pub fn writeIntBig(self: Self, comptime T: type, value: T) void { + mem.writeIntBig( + T, + self.code.addManyAsArrayAssumeCapacity(@divExact(@typeInfo(T).Int.bits, 8)), + value, + ); + } + + /// Directly write a number to the code array with little endianness + pub fn writeIntLittle(self: Self, comptime T: type, value: T) void { + mem.writeIntLittle( + T, + self.code.addManyAsArrayAssumeCapacity(@divExact(@typeInfo(T).Int.bits, 8)), + value, + ); + } + + // -------- + // Prefixes + // -------- + pub const LegacyPrefixes = packed struct { /// LOCK prefix_f0: bool = false, @@ -212,322 +219,391 @@ pub const Instruction = struct { /// Branch taken prefix_3e: bool = false, - /// Operand size override + /// Operand size override (enables 16 bit operation) prefix_66: bool = false, - /// Address size override + /// Address size override (enables 16 bit address size) prefix_67: bool = false, padding: u5 = 0, }; - /// Encodes an effective address for the Mod + R/M part of the ModR/M byte - /// - /// Note that depending on the instruction, not all effective addresses are allowed. 
- /// - /// Examples: - /// eax: .reg = .eax - /// [eax]: .mem = .eax - /// [eax + 8]: .mem_disp = .{ .reg = .eax, .disp = 8 } - /// [eax - 8]: .mem_disp = .{ .reg = .eax, .disp = -8 } - /// [55]: .disp32 = 55 - pub const ModrmEffectiveAddress = union(enum) { - reg: Register, - mem: Register, - mem_disp: struct { - reg: Register, - disp: i32, - }, - disp32: u32, - - pub fn isExtended(self: @This()) bool { - return switch (self) { - .reg => |reg| reg.isExtended(), - .mem => |memea| memea.isExtended(), - .mem_disp => |mem_disp| mem_disp.reg.isExtended(), - .disp32 => false, - }; - } - }; - - /// Encodes an effective address for the SIB byte - /// - /// Note that depending on the instruction, not all effective addresses are allowed. - /// - /// Examples: - /// [eax + ebx * 2]: .base_index = .{ .base = .eax, .index = .ebx, .scale = 2 } - /// [eax]: .base_index = .{ .base = .eax, .index = null, .scale = 1 } - /// [ebx * 2 + 256]: .index_disp = .{ .index = .ebx, .scale = 2, .disp = 256 } - /// [[ebp] + ebx * 2 + 8]: .ebp_index_disp = .{ .index = .ebx, .scale = 2, .disp = 8 } - pub const SibEffectiveAddress = union(enum) { - base_index: struct { - base: Register, - index: ?Register, - scale: u8, // 1, 2, 4, or 8 - }, - index_disp: struct { - index: ?Register, - scale: u8, // 1, 2, 4, or 8 - disp: u32, - }, - ebp_index_disp: struct { - index: ?Register, - scale: u8, // 1, 2, 4, or 8 - disp: u32, - }, - - pub fn baseIsExtended(self: @This()) bool { - return switch (self) { - .base_index => |base_index| base_index.base.isExtended(), - .index_disp, .ebp_index_disp => false, - }; - } - - pub fn indexIsExtended(self: @This()) bool { - return switch (self) { - .base_index => |base_index| if (base_index.index) |idx| idx.isExtended() else false, - .index_disp => |index_disp| if (index_disp.index) |idx| idx.isExtended() else false, - .ebp_index_disp => |ebp_index_disp| if (ebp_index_disp.index) |idx| idx.isExtended() else false, - }; - } - }; - - /// Writes the encoded Instruction to the code ArrayList - pub fn encodeInto(inst: Instruction, code: *ArrayList(u8)) !void { - // We need to write the following, in that order: - // - Legacy prefixes (0 to 13 bytes) - // - REX prefix (0 to 1 byte) - // - Opcode (1, 2, or 3 bytes) - // - ModR/M (0 or 1 byte) - // - SIB (0 or 1 byte) - // - Displacement (0, 1, 2, or 4 bytes) - // - Immediate (0, 1, 2, 4, or 8 bytes) - - // By this calculation, an instruction could be up to 31 bytes long (will probably not happen) - try code.ensureCapacity(code.items.len + 31); - - // Legacy prefixes - if (@bitCast(u16, inst.legacy_prefixes) != 0) { + /// Encodes legacy prefixes + pub fn legacyPrefixes(self: Self, prefixes: LegacyPrefixes) void { + if (@bitCast(u16, prefixes) != 0) { // Hopefully this path isn't taken very often, so we'll do it the slow way for now // LOCK - if (inst.legacy_prefixes.prefix_f0) code.appendAssumeCapacity(0xf0); + if (prefixes.prefix_f0) self.code.appendAssumeCapacity(0xf0); // REPNZ, REPNE, REP, Scalar Double-precision - if (inst.legacy_prefixes.prefix_f2) code.appendAssumeCapacity(0xf2); + if (prefixes.prefix_f2) self.code.appendAssumeCapacity(0xf2); // REPZ, REPE, REP, Scalar Single-precision - if (inst.legacy_prefixes.prefix_f3) code.appendAssumeCapacity(0xf3); + if (prefixes.prefix_f3) self.code.appendAssumeCapacity(0xf3); // CS segment override or Branch not taken - if (inst.legacy_prefixes.prefix_2e) code.appendAssumeCapacity(0x2e); + if (prefixes.prefix_2e) self.code.appendAssumeCapacity(0x2e); // DS segment override - if 
(inst.legacy_prefixes.prefix_36) code.appendAssumeCapacity(0x36); + if (prefixes.prefix_36) self.code.appendAssumeCapacity(0x36); // ES segment override - if (inst.legacy_prefixes.prefix_26) code.appendAssumeCapacity(0x26); + if (prefixes.prefix_26) self.code.appendAssumeCapacity(0x26); // FS segment override - if (inst.legacy_prefixes.prefix_64) code.appendAssumeCapacity(0x64); + if (prefixes.prefix_64) self.code.appendAssumeCapacity(0x64); // GS segment override - if (inst.legacy_prefixes.prefix_65) code.appendAssumeCapacity(0x65); + if (prefixes.prefix_65) self.code.appendAssumeCapacity(0x65); // Branch taken - if (inst.legacy_prefixes.prefix_3e) code.appendAssumeCapacity(0x3e); + if (prefixes.prefix_3e) self.code.appendAssumeCapacity(0x3e); // Operand size override - if (inst.legacy_prefixes.prefix_66) code.appendAssumeCapacity(0x66); + if (prefixes.prefix_66) self.code.appendAssumeCapacity(0x66); // Address size override - if (inst.legacy_prefixes.prefix_67) code.appendAssumeCapacity(0x67); + if (prefixes.prefix_67) self.code.appendAssumeCapacity(0x67); } + } - // REX prefix - // - // A REX prefix has the following form: - // 0b0100_WRXB - // 0100: fixed bits - // W: stands for "wide", indicates that the instruction uses 64-bit operands. - // R, X, and B each contain the 4th bit of a register - // these have to be set when using registers 8-15. - // R: stands for "reg", extends the reg field in the ModR/M byte. - // X: stands for "index", extends the index field in the SIB byte. - // B: stands for "base", extends either the r/m field in the ModR/M byte, - // the base field in the SIB byte, - // or the opcode reg field in the Opcode byte. - { - var value: u8 = 0x40; - if (inst.opcode_reg) |opcode_reg| { - if (opcode_reg.isExtended()) { - value |= 0x1; - } - } - if (inst.modrm) |modrm| { - if (modrm.isExtended()) { - value |= 0x1; - } - } - if (inst.sib) |sib| { - if (sib.baseIsExtended()) { - value |= 0x1; - } - if (sib.indexIsExtended()) { - value |= 0x2; - } - } - if (inst.reg) |reg| { - if (reg.isExtended()) { - value |= 0x4; - } - } - if (inst.operand_size_64) { - value |= 0x8; - } - if (value != 0x40) { - code.appendAssumeCapacity(value); - } - } + /// Use 16 bit operand size + /// + /// Note that this flag is overridden by REX.W, if both are present. 
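+    ///
+    /// For example, a sketch of encoding `add ax, 0x1234` with this helper
+    /// (raw ModR/M field values are used here for illustration):
+    ///
+    ///     encoder.prefix16BitMode();
+    ///     encoder.opcode_1byte(0x81);
+    ///     encoder.modRm_direct(0, 0); // opx = /0 (ADD), r/m = 0 (ax)
+    ///     encoder.imm16(0x1234);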
+ pub fn prefix16BitMode(self: Self) void { + self.code.appendAssumeCapacity(0x66); + } - // Opcode - if (inst.primary_opcode_1b) |opcode| { - var value = opcode; - if (inst.opcode_reg) |opcode_reg| { - value |= opcode_reg.low_id(); - } - code.appendAssumeCapacity(value); - } else if (inst.primary_opcode_2b) |opcode| { - code.appendAssumeCapacity(0x0f); - var value = opcode; - if (inst.opcode_reg) |opcode_reg| { - value |= opcode_reg.low_id(); - } - code.appendAssumeCapacity(value); - } + /// From section 2.2.1.2 of the manual, REX is encoded as b0100WRXB + pub const Rex = struct { + /// Wide, enables 64-bit operation + w: bool = false, + /// Extends the reg field in the ModR/M byte + r: bool = false, + /// Extends the index field in the SIB byte + x: bool = false, + /// Extends the r/m field in the ModR/M byte, + /// or the base field in the SIB byte, + /// or the reg field in the Opcode byte + b: bool = false, + }; - var disp8: ?u8 = null; - var disp16: ?u16 = null; - var disp32: ?u32 = null; - - // ModR/M - // - // Example ModR/M byte: - // c7: ModR/M byte that contains: - // 11 000 111: - // ^ ^ ^ - // mod | | - // reg | - // r/m - // where mod = 11 indicates that both operands are registers, - // reg = 000 indicates that the first operand is register EAX - // r/m = 111 indicates that the second operand is register EDI (since mod = 11) - if (inst.modrm != null or inst.reg != null or inst.opcode_extension != null) { - var value: u8 = 0; - - // mod + rm - if (inst.modrm) |modrm| { - switch (modrm) { - .reg => |reg| { - value |= reg.low_id(); - value |= 0b11_000_000; - }, - .mem => |memea| { - assert(memea.low_id() != 4 and memea.low_id() != 5); - value |= memea.low_id(); - // value |= 0b00_000_000; - }, - .mem_disp => |mem_disp| { - assert(mem_disp.reg.low_id() != 4); - value |= mem_disp.reg.low_id(); - if (mem_disp.disp < 128) { - // Use 1 byte of displacement - value |= 0b01_000_000; - disp8 = @bitCast(u8, @intCast(i8, mem_disp.disp)); - } else { - // Use all 4 bytes of displacement - value |= 0b10_000_000; - disp32 = @bitCast(u32, mem_disp.disp); - } - }, - .disp32 => |d| { - value |= 0b00_000_101; - disp32 = d; - }, - } - } - - // reg - if (inst.reg) |reg| { - value |= @as(u8, reg.low_id()) << 3; - } else if (inst.opcode_extension) |ext| { - value |= @as(u8, ext) << 3; - } - - code.appendAssumeCapacity(value); - } + /// Encodes a REX prefix byte given all the fields + /// + /// Use this byte whenever you need 64 bit operation, + /// or one of reg, index, r/m, base, or opcode-reg might be extended. + /// + /// See struct `Rex` for a description of each field. + /// + /// Does not add a prefix byte if none of the fields are set! + pub fn rex(self: Self, byte: Rex) void { + var value: u8 = 0b0100_0000; - // SIB - { - if (inst.sib) |sib| { - return error.TODOSIBByteForX8664; - } - } + if (byte.w) value |= 0b1000; + if (byte.r) value |= 0b0100; + if (byte.x) value |= 0b0010; + if (byte.b) value |= 0b0001; - // Displacement - // - // The size of the displacement depends on the instruction used and is very fragile. - // The bytes are simply written in LE order. - { - - // These writes won't fail because we ensured capacity earlier. 
- if (disp8) |d| - code.appendAssumeCapacity(d) - else if (disp16) |d| - mem.writeIntLittle(u16, code.addManyAsArrayAssumeCapacity(2), d) - else if (disp32) |d| - mem.writeIntLittle(u32, code.addManyAsArrayAssumeCapacity(4), d); + if (value != 0b0100_0000) { + self.code.appendAssumeCapacity(value); } + } - // Immediate - // - // The size of the immediate depends on the instruction used and is very fragile. - // The bytes are simply written in LE order. - { - // These writes won't fail because we ensured capacity earlier. - if (inst.immediate_bytes == 1) - code.appendAssumeCapacity(@intCast(u8, inst.immediate)) - else if (inst.immediate_bytes == 2) - mem.writeIntLittle(u16, code.addManyAsArrayAssumeCapacity(2), @intCast(u16, inst.immediate)) - else if (inst.immediate_bytes == 4) - mem.writeIntLittle(u32, code.addManyAsArrayAssumeCapacity(4), @intCast(u32, inst.immediate)) - else if (inst.immediate_bytes == 8) - mem.writeIntLittle(u64, code.addManyAsArrayAssumeCapacity(8), inst.immediate); - } + // ------ + // Opcode + // ------ + + /// Encodes a 1 byte opcode + pub fn opcode_1byte(self: Self, opcode: u8) void { + self.code.appendAssumeCapacity(opcode); + } + + /// Encodes a 2 byte opcode + /// + /// e.g. IMUL has the opcode 0x0f 0xaf, so you use + /// + /// encoder.opcode_2byte(0x0f, 0xaf); + pub fn opcode_2byte(self: Self, prefix: u8, opcode: u8) void { + self.code.appendAssumeCapacity(prefix); + self.code.appendAssumeCapacity(opcode); + } + + /// Encodes a 1 byte opcode with a reg field + /// + /// Remember to add a REX prefix byte if reg is extended! + pub fn opcode_withReg(self: Self, opcode: u8, reg: u3) void { + assert(opcode & 0b111 == 0); + self.code.appendAssumeCapacity(opcode | reg); + } + + // ------ + // ModR/M + // ------ + + /// Construct a ModR/M byte given all the fields + /// + /// Remember to add a REX prefix byte if reg or rm are extended! + pub fn modRm(self: Self, mod: u2, reg_or_opx: u3, rm: u3) void { + self.code.appendAssumeCapacity( + @as(u8, mod) << 6 | @as(u8, reg_or_opx) << 3 | rm, + ); + } + + /// Construct a ModR/M byte using direct r/m addressing + /// r/m effective address: r/m + /// + /// Note reg's effective address is always just reg for the ModR/M byte. + /// Remember to add a REX prefix byte if reg or rm are extended! + pub fn modRm_direct(self: Self, reg_or_opx: u3, rm: u3) void { + self.modRm(0b11, reg_or_opx, rm); + } + + /// Construct a ModR/M byte using indirect r/m addressing + /// r/m effective address: [r/m] + /// + /// Note reg's effective address is always just reg for the ModR/M byte. + /// Remember to add a REX prefix byte if reg or rm are extended! + pub fn modRm_indirectDisp0(self: Self, reg_or_opx: u3, rm: u3) void { + assert(rm != 4 and rm != 5); + self.modRm(0b00, reg_or_opx, rm); + } + + /// Construct a ModR/M byte using indirect SIB addressing + /// r/m effective address: [SIB] + /// + /// Note reg's effective address is always just reg for the ModR/M byte. + /// Remember to add a REX prefix byte if reg or rm are extended! + pub fn modRm_SIBDisp0(self: Self, reg_or_opx: u3) void { + self.modRm(0b00, reg_or_opx, 0b100); + } + + /// Construct a ModR/M byte using RIP-relative addressing + /// r/m effective address: [RIP + disp32] + /// + /// Note reg's effective address is always just reg for the ModR/M byte. + /// Remember to add a REX prefix byte if reg or rm are extended! 
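+    ///
+    /// For example, a sketch of `lea rax, [rip + 0x10]`:
+    ///
+    ///     encoder.rex(.{ .w = true });
+    ///     encoder.opcode_1byte(0x8D);
+    ///     encoder.modRm_RIPDisp32(Register.rax.low_id());
+    ///     encoder.disp32(0x10);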
+    pub fn modRm_RIPDisp32(self: Self, reg_or_opx: u3) void {
+        self.modRm(0b00, reg_or_opx, 0b101);
+    }
+
+    /// Construct a ModR/M byte using indirect r/m with an 8bit displacement
+    /// r/m effective address: [r/m + disp8]
+    ///
+    /// Note reg's effective address is always just reg for the ModR/M byte.
+    /// Remember to add a REX prefix byte if reg or rm are extended!
+    pub fn modRm_indirectDisp8(self: Self, reg_or_opx: u3, rm: u3) void {
+        assert(rm != 4);
+        self.modRm(0b01, reg_or_opx, rm);
+    }
+
+    /// Construct a ModR/M byte using indirect SIB with an 8bit displacement
+    /// r/m effective address: [SIB + disp8]
+    ///
+    /// Note reg's effective address is always just reg for the ModR/M byte.
+    /// Remember to add a REX prefix byte if reg or rm are extended!
+    pub fn modRm_SIBDisp8(self: Self, reg_or_opx: u3) void {
+        self.modRm(0b01, reg_or_opx, 0b100);
+    }
+
+    /// Construct a ModR/M byte using indirect r/m with a 32bit displacement
+    /// r/m effective address: [r/m + disp32]
+    ///
+    /// Note reg's effective address is always just reg for the ModR/M byte.
+    /// Remember to add a REX prefix byte if reg or rm are extended!
+    pub fn modRm_indirectDisp32(self: Self, reg_or_opx: u3, rm: u3) void {
+        assert(rm != 4);
+        self.modRm(0b10, reg_or_opx, rm);
+    }
+
+    /// Construct a ModR/M byte using indirect SIB with a 32bit displacement
+    /// r/m effective address: [SIB + disp32]
+    ///
+    /// Note reg's effective address is always just reg for the ModR/M byte.
+    /// Remember to add a REX prefix byte if reg or rm are extended!
+    pub fn modRm_SIBDisp32(self: Self, reg_or_opx: u3) void {
+        self.modRm(0b10, reg_or_opx, 0b100);
+    }
+
+    // ---
+    // SIB
+    // ---
+
+    /// Construct a SIB byte given all the fields
+    ///
+    /// Remember to add a REX prefix byte if index or base are extended!
+    pub fn sib(self: Self, scale: u2, index: u3, base: u3) void {
+        self.code.appendAssumeCapacity(
+            @as(u8, scale) << 6 | @as(u8, index) << 3 | base,
+        );
+    }
+
+    /// Construct a SIB byte with scale * index + base, no frills.
+    /// r/m effective address: [base + scale * index]
+    ///
+    /// Remember to add a REX prefix byte if index or base are extended!
+    pub fn sib_scaleIndexBase(self: Self, scale: u2, index: u3, base: u3) void {
+        assert(base != 5);
+
+        self.sib(scale, index, base);
+    }
+
+    /// Construct a SIB byte with scale * index + disp32
+    /// r/m effective address: [scale * index + disp32]
+    ///
+    /// Remember to add a REX prefix byte if index or base are extended!
+    pub fn sib_scaleIndexDisp32(self: Self, scale: u2, index: u3) void {
+        assert(index != 4);
+
+        // index = 4 means no index
+        // base = 5 means no base, if mod == 0.
+        self.sib(scale, index, 5);
+    }
+
+    /// Construct a SIB byte with just base
+    /// r/m effective address: [base]
+    ///
+    /// Remember to add a REX prefix byte if index or base are extended!
+    pub fn sib_base(self: Self, base: u3) void {
+        assert(base != 5);
+
+        // scale is actually ignored
+        // index = 4 means no index
+        self.sib(0, 4, base);
+    }
+
+    /// Construct a SIB byte with just disp32
+    /// r/m effective address: [disp32]
+    ///
+    /// Remember to add a REX prefix byte if index or base are extended!
+    pub fn sib_disp32(self: Self) void {
+        // scale is actually ignored
+        // index = 4 means no index
+        // base = 5 means no base, if mod == 0.
+        self.sib(0, 4, 5);
+    }
+
+    /// Construct a SIB byte with scale * index + base + disp8
+    /// r/m effective address: [base + scale * index + disp8]
+    ///
+    /// Remember to add a REX prefix byte if index or base are extended!
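+    ///
+    /// For example, a sketch of the ModR/M tail of `mov rax, [rcx + 4*rdx + 8]`
+    /// (the REX.W prefix and the 0x8B opcode come first):
+    ///
+    ///     encoder.modRm_SIBDisp8(Register.rax.low_id());
+    ///     encoder.sib_scaleIndexBaseDisp8(2, Register.rdx.low_id(), Register.rcx.low_id());
+    ///     encoder.disp8(8);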
+    pub fn sib_scaleIndexBaseDisp8(self: Self, scale: u2, index: u3, base: u3) void {
+        self.sib(scale, index, base);
+    }
+
+    /// Construct a SIB byte with base + disp8, no index
+    /// r/m effective address: [base + disp8]
+    ///
+    /// Remember to add a REX prefix byte if index or base are extended!
+    pub fn sib_baseDisp8(self: Self, base: u3) void {
+        // scale is ignored
+        // index = 4 means no index
+        self.sib(0, 4, base);
+    }
+
+    /// Construct a SIB byte with scale * index + base + disp32
+    /// r/m effective address: [base + scale * index + disp32]
+    ///
+    /// Remember to add a REX prefix byte if index or base are extended!
+    pub fn sib_scaleIndexBaseDisp32(self: Self, scale: u2, index: u3, base: u3) void {
+        self.sib(scale, index, base);
+    }
+
+    /// Construct a SIB byte with base + disp32, no index
+    /// r/m effective address: [base + disp32]
+    ///
+    /// Remember to add a REX prefix byte if index or base are extended!
+    pub fn sib_baseDisp32(self: Self, base: u3) void {
+        // scale is ignored
+        // index = 4 means no index
+        self.sib(0, 4, base);
+    }
+
+    // -------------------------
+    // Trivial (no bit fiddling)
+    // -------------------------
+
+    /// Encode an 8 bit immediate
+    ///
+    /// It is sign-extended to 64 bits by the cpu.
+    pub fn imm8(self: Self, imm: i8) void {
+        self.code.appendAssumeCapacity(@bitCast(u8, imm));
+    }
+
+    /// Encode an 8 bit displacement
+    ///
+    /// It is sign-extended to 64 bits by the cpu.
+    pub fn disp8(self: Self, disp: i8) void {
+        self.code.appendAssumeCapacity(@bitCast(u8, disp));
+    }
+
+    /// Encode a 16 bit immediate
+    ///
+    /// It is sign-extended to 64 bits by the cpu.
+    pub fn imm16(self: Self, imm: i16) void {
+        self.writeIntLittle(i16, imm);
+    }
+
+    /// Encode a 32 bit immediate
+    ///
+    /// It is sign-extended to 64 bits by the cpu.
+    pub fn imm32(self: Self, imm: i32) void {
+        self.writeIntLittle(i32, imm);
+    }
+
+    /// Encode a 32 bit displacement
+    ///
+    /// It is sign-extended to 64 bits by the cpu.
+    pub fn disp32(self: Self, disp: i32) void {
+        self.writeIntLittle(i32, disp);
+    }
+
+    /// Encode a 64 bit immediate
+    ///
+    /// It is encoded as-is, all 8 bytes; no sign extension takes place.
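+    ///
+    /// For example, a sketch of `movabs rax, 0x123456789abcdef0`:
+    ///
+    ///     encoder.rex(.{ .w = true });
+    ///     encoder.opcode_withReg(0xB8, Register.rax.low_id());
+    ///     encoder.imm64(0x123456789abcdef0);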
+ pub fn imm64(self: Self, imm: u64) void { + self.writeIntLittle(u64, imm); } }; -fn expectEncoded(inst: Instruction, expected: []const u8) !void { +test "x86_64 Encoder helpers" { var code = ArrayList(u8).init(testing.allocator); defer code.deinit(); - try inst.encodeInto(&code); - testing.expectEqualSlices(u8, expected, code.items); -} -test "x86_64 Instruction.encodeInto" { // simple integer multiplication // imul eax,edi // 0faf c7 - try expectEncoded(Instruction{ - .primary_opcode_2b = 0xaf, // imul - .reg = .eax, // destination - .modrm = .{ .reg = .edi }, // source - }, &[_]u8{ 0x0f, 0xaf, 0xc7 }); + { + try code.resize(0); + const encoder = try Encoder.init(&code, 4); + encoder.rex(.{ + .r = Register.eax.isExtended(), + .b = Register.edi.isExtended(), + }); + encoder.opcode_2byte(0x0f, 0xaf); + encoder.modRm_direct( + Register.eax.low_id(), + Register.edi.low_id(), + ); + + testing.expectEqualSlices(u8, &[_]u8{ 0x0f, 0xaf, 0xc7 }, code.items); + } // simple mov // mov eax,edi // 89 f8 - try expectEncoded(Instruction{ - .primary_opcode_1b = 0x89, // mov (with rm as destination) - .reg = .edi, // source - .modrm = .{ .reg = .eax }, // destination - }, &[_]u8{ 0x89, 0xf8 }); + { + try code.resize(0); + const encoder = try Encoder.init(&code, 3); + encoder.rex(.{ + .r = Register.edi.isExtended(), + .b = Register.eax.isExtended(), + }); + encoder.opcode_1byte(0x89); + encoder.modRm_direct( + Register.edi.low_id(), + Register.eax.low_id(), + ); + + testing.expectEqualSlices(u8, &[_]u8{ 0x89, 0xf8 }, code.items); + } // signed integer addition of 32-bit sign extended immediate to 64 bit register @@ -542,19 +618,19 @@ test "x86_64 Instruction.encodeInto" { // : 000 <-- opcode_extension = 0 because opcode extension is /0. /0 specifies ADD // : 001 <-- 001 is rcx // ffffff7f : 2147483647 - try expectEncoded(Instruction{ - // REX.W + - .operand_size_64 = true, - // 81 - .primary_opcode_1b = 0x81, - // /0 - .opcode_extension = 0, - // rcx - .modrm = .{ .reg = .rcx }, - // immediate - .immediate_bytes = 4, - .immediate = 2147483647, - }, &[_]u8{ 0x48, 0x81, 0xc1, 0xff, 0xff, 0xff, 0x7f }); + { + try code.resize(0); + const encoder = try Encoder.init(&code, 7); + encoder.rex(.{ .w = true }); // use 64 bit operation + encoder.opcode_1byte(0x81); + encoder.modRm_direct( + 0, + Register.rcx.low_id(), + ); + encoder.imm32(2147483647); + + testing.expectEqualSlices(u8, &[_]u8{ 0x48, 0x81, 0xc1, 0xff, 0xff, 0xff, 0x7f }, code.items); + } } // TODO add these registers to the enum and populate dwarfLocOp -- cgit v1.2.3 From b004c3da159645cf4a3387f808ab3e8a6277ba2f Mon Sep 17 00:00:00 2001 From: gracefu <81774659+gracefuu@users.noreply.github.com> Date: Sun, 11 Apr 2021 16:41:49 +0800 Subject: stage2 x86_64: try to fix RIP-relative offset to GOT for macho --- src/codegen.zig | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'src/codegen.zig') diff --git a/src/codegen.zig b/src/codegen.zig index 27a60597d4..e24d197d54 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -3860,32 +3860,35 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { .memory => |x| { if (self.bin_file.options.pie) { // RIP-relative displacement to the entry in the GOT table. + const abi_size = ty.abiSize(self.target.*); + const encoder = try X8664Encoder.init(self.code, 7); + + // LEA reg, [] + + // We encode the instruction FIRST because prefixes may or may not appear. 
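+                    // For example, with a REX prefix the LEA below is encoded as
+                    //   48 8d 05 xx xx xx xx    (7 bytes)
+                    // and without one as
+                    //   8d 05 xx xx xx xx       (6 bytes).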
+ // After we encode the instruction, we will know that the displacement bytes + // for [] will be at self.code.items.len - 4. + encoder.rex(.{ + .w = abi_size == 64, + .r = reg.isExtended(), + }); + encoder.opcode_1byte(0x8D); + encoder.modRm_RIPDisp32(reg.low_id()); + encoder.disp32(0); + // TODO we should come up with our own, backend independent relocation types // which each backend (Elf, MachO, etc.) would then translate into an actual // fixup when linking. if (self.bin_file.cast(link.File.MachO)) |macho_file| { try macho_file.pie_fixups.append(self.bin_file.allocator, .{ .target_addr = x, - .offset = self.code.items.len + 3, + .offset = self.code.items.len - 4, .size = 4, }); } else { return self.fail(src, "TODO implement genSetReg for PIE GOT indirection on this platform", .{}); } - const abi_size = ty.abiSize(self.target.*); - const encoder = try X8664Encoder.init(self.code, 7); - // LEA reg, [] - // TODO: Check if this breaks on macho if abi_size != 64 and reg is not extended - // this causes rex byte to be omitted, which might mean the offset (+3) above is wrong. - encoder.rex(.{ - .w = abi_size == 64, - .r = reg.isExtended(), - }); - encoder.opcode_1byte(0x8D); - encoder.modRm_RIPDisp32(reg.low_id()); - encoder.disp32(0); - // MOV reg, [reg] encoder.rex(.{ .w = abi_size == 64, -- cgit v1.2.3 From 62e755623fb02257ac5d8fe85be01f5056469cb5 Mon Sep 17 00:00:00 2001 From: gracefu <81774659+gracefuu@users.noreply.github.com> Date: Sun, 11 Apr 2021 18:46:05 +0800 Subject: stage2 x86_64: bugfix abi_size == 64 should be abi_size == 8 --- src/codegen.zig | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'src/codegen.zig') diff --git a/src/codegen.zig b/src/codegen.zig index e24d197d54..0d65150bf2 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -1691,10 +1691,10 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { .register => |src_reg| { // for register, register use mr + 1 // addressing mode: *r/m16/32/64*, r16/32/64 - const operand_size = dst_ty.abiSize(self.target.*); + const abi_size = dst_ty.abiSize(self.target.*); const encoder = try X8664Encoder.init(self.code, 3); encoder.rex(.{ - .w = operand_size == 64, + .w = abi_size == 8, .r = src_reg.isExtended(), .b = dst_reg.isExtended(), }); @@ -1710,10 +1710,10 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // opx = 83: r/m16/32/64, imm8 const imm32 = @intCast(i32, imm); // This case must be handled before calling genX8664BinMathCode. 
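                    // Using opcode 0x83 with an imm8 saves three bytes over 0x81 with an
                    // imm32 whenever the immediate fits, e.g. (a sketch):
                    //   48 83 c1 10             add rcx, 0x10
                    //   48 81 c1 00 01 00 00    add rcx, 0x100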
if (imm32 <= math.maxInt(i8)) { - const operand_size = dst_ty.abiSize(self.target.*); + const abi_size = dst_ty.abiSize(self.target.*); const encoder = try X8664Encoder.init(self.code, 4); encoder.rex(.{ - .w = operand_size == 64, + .w = abi_size == 8, .b = dst_reg.isExtended(), }); encoder.opcode_1byte(0x83); @@ -1723,10 +1723,10 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { ); encoder.imm8(@intCast(i8, imm32)); } else { - const operand_size = dst_ty.abiSize(self.target.*); + const abi_size = dst_ty.abiSize(self.target.*); const encoder = try X8664Encoder.init(self.code, 7); encoder.rex(.{ - .w = operand_size == 64, + .w = abi_size == 8, .b = dst_reg.isExtended(), }); encoder.opcode_1byte(0x81); @@ -1750,7 +1750,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } const encoder = try X8664Encoder.init(self.code, 7); encoder.rex(.{ - .w = abi_size == 64, + .w = abi_size == 8, .r = dst_reg.isExtended(), }); encoder.opcode_1byte(mr + 3); @@ -1837,7 +1837,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { const abi_size = dst_ty.abiSize(self.target.*); const encoder = try X8664Encoder.init(self.code, 4); encoder.rex(.{ - .w = abi_size == 64, + .w = abi_size == 8, .r = dst_reg.isExtended(), .b = src_reg.isExtended(), }); @@ -1866,7 +1866,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { const abi_size = dst_ty.abiSize(self.target.*); const encoder = try X8664Encoder.init(self.code, 4); encoder.rex(.{ - .w = abi_size == 64, + .w = abi_size == 8, .r = dst_reg.isExtended(), .b = dst_reg.isExtended(), }); @@ -1880,7 +1880,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { const abi_size = dst_ty.abiSize(self.target.*); const encoder = try X8664Encoder.init(self.code, 7); encoder.rex(.{ - .w = abi_size == 64, + .w = abi_size == 8, .r = dst_reg.isExtended(), .b = dst_reg.isExtended(), }); @@ -1923,7 +1923,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { const abi_size = dst_ty.abiSize(self.target.*); const encoder = try X8664Encoder.init(self.code, 4); encoder.rex(.{ - .w = abi_size == 64, + .w = abi_size == 8, .r = dst_reg.isExtended(), .b = src_reg.isExtended(), }); @@ -1965,7 +1965,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { const i_adj_off = -@intCast(i32, adj_off); const encoder = try X8664Encoder.init(self.code, 7); encoder.rex(.{ - .w = abi_size == 64, + .w = abi_size == 8, .r = reg.isExtended(), }); encoder.opcode_1byte(opcode); @@ -3850,7 +3850,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { const abi_size = ty.abiSize(self.target.*); const encoder = try X8664Encoder.init(self.code, 3); encoder.rex(.{ - .w = abi_size == 64, + .w = abi_size == 8, .r = reg.isExtended(), .b = src_reg.isExtended(), }); @@ -3869,7 +3869,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // After we encode the instruction, we will know that the displacement bytes // for [] will be at self.code.items.len - 4. 
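                // Note: Type.abiSize returns a size in bytes, so a 64-bit value has
                // abi_size == 8, not 64.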
encoder.rex(.{ - .w = abi_size == 64, + .w = abi_size == 8, .r = reg.isExtended(), }); encoder.opcode_1byte(0x8D); @@ -3891,7 +3891,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // MOV reg, [reg] encoder.rex(.{ - .w = abi_size == 64, + .w = abi_size == 8, .r = reg.isExtended(), .b = reg.isExtended(), }); @@ -3908,7 +3908,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { const abi_size = ty.abiSize(self.target.*); const encoder = try X8664Encoder.init(self.code, 8); encoder.rex(.{ - .w = abi_size == 64, + .w = abi_size == 8, .r = reg.isExtended(), }); encoder.opcode_1byte(0x8B); @@ -3952,7 +3952,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { const abi_size = ty.abiSize(self.target.*); const encoder = try X8664Encoder.init(self.code, 3); encoder.rex(.{ - .w = abi_size == 64, + .w = abi_size == 8, .r = reg.isExtended(), .b = reg.isExtended(), }); @@ -3970,7 +3970,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { const ioff = -@intCast(i32, off); const encoder = try X8664Encoder.init(self.code, 3); encoder.rex(.{ - .w = abi_size == 64, + .w = abi_size == 8, .r = reg.isExtended(), }); encoder.opcode_1byte(0x8B); -- cgit v1.2.3 From 1e63e8d8b6559855eca6efac6eec18f0546ecafd Mon Sep 17 00:00:00 2001 From: gracefu <81774659+gracefuu@users.noreply.github.com> Date: Sun, 11 Apr 2021 22:34:23 +0800 Subject: stage2 x86_64: fix codegen ensureCapacity bug for function calls Co-authored-by: joachimschmidt557 --- src/codegen.zig | 1 + 1 file changed, 1 insertion(+) (limited to 'src/codegen.zig') diff --git a/src/codegen.zig b/src/codegen.zig index 0d65150bf2..6e734dad90 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -2424,6 +2424,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { .x86_64 => { try self.genSetReg(inst.base.src, Type.initTag(.u32), .rax, .{ .memory = got_addr }); // callq *%rax + try self.code.ensureCapacity(self.code.items.len + 2); self.code.appendSliceAssumeCapacity(&[2]u8{ 0xff, 0xd0 }); }, .aarch64 => { -- cgit v1.2.3 From dc136627251c920713536807e0da922c140ca588 Mon Sep 17 00:00:00 2001 From: gracefu <81774659+gracefuu@users.noreply.github.com> Date: Sun, 11 Apr 2021 23:27:43 +0800 Subject: stage2 x86_64: force 64 bit mode when loading address of GOT Co-authored-by: joachimschmidt557 --- src/codegen.zig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'src/codegen.zig') diff --git a/src/codegen.zig b/src/codegen.zig index 6e734dad90..ee2f5403cc 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -2422,13 +2422,13 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { log.debug("got_addr = 0x{x}", .{got_addr}); switch (arch) { .x86_64 => { - try self.genSetReg(inst.base.src, Type.initTag(.u32), .rax, .{ .memory = got_addr }); + try self.genSetReg(inst.base.src, Type.initTag(.u64), .rax, .{ .memory = got_addr }); // callq *%rax try self.code.ensureCapacity(self.code.items.len + 2); self.code.appendSliceAssumeCapacity(&[2]u8{ 0xff, 0xd0 }); }, .aarch64 => { - try self.genSetReg(inst.base.src, Type.initTag(.u32), .x30, .{ .memory = got_addr }); + try self.genSetReg(inst.base.src, Type.initTag(.u64), .x30, .{ .memory = got_addr }); // blr x30 writeInt(u32, try self.code.addManyAsArray(4), Instruction.blr(.x30).toU32()); }, @@ -3862,7 +3862,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { if (self.bin_file.options.pie) { // RIP-relative displacement to the entry in the GOT table. 
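                // The GOT entry holds an address, and addresses are always 8 bytes
                // wide on x86_64, so the LEA below must be a 64-bit operation no
                // matter the ABI size of the value being loaded.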
const abi_size = ty.abiSize(self.target.*); - const encoder = try X8664Encoder.init(self.code, 7); + const encoder = try X8664Encoder.init(self.code, 10); // LEA reg, [] @@ -3870,7 +3870,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // After we encode the instruction, we will know that the displacement bytes // for [] will be at self.code.items.len - 4. encoder.rex(.{ - .w = abi_size == 8, + .w = true, // force 64 bit because loading an address (to the GOT) .r = reg.isExtended(), }); encoder.opcode_1byte(0x8D); -- cgit v1.2.3 From cfeb412a4263809698f941081197cd0ff7f260aa Mon Sep 17 00:00:00 2001 From: gracefu <81774659+gracefuu@users.noreply.github.com> Date: Mon, 12 Apr 2021 01:29:40 +0800 Subject: stage2 x86_64: fix incorrect comment in genX8664BinMath Co-authored-by: joachimschmidt557 --- src/codegen.zig | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'src/codegen.zig') diff --git a/src/codegen.zig b/src/codegen.zig index ee2f5403cc..6a80a4db6b 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -1505,8 +1505,14 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { /// ADD, SUB, XOR, OR, AND fn genX8664BinMath(self: *Self, inst: *ir.Inst, op_lhs: *ir.Inst, op_rhs: *ir.Inst) !MCValue { // We'll handle these ops in two steps. - // 1) Prepare an output register, and put one of the arguments in it + // 1) Prepare an output location (register or memory) + // This location will be the location of the operand that dies (if one exists) + // or just a temporary register (if one doesn't exist) // 2) Perform the op with the other argument + // 3) Sometimes, the output location is memory but the op doesn't support it. + // In this case, copy that location to a register, then perform the op to that register instead. + // + // TODO: make this algorithm less bad try self.code.ensureCapacity(self.code.items.len + 8); -- cgit v1.2.3